package main

import (
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/reg"
)

// genSqrt_F64 emits the assembly for Sqrt_AVX2_F64, which overwrites every
// element of x with its square root.
//
// The generated code processes four float64 values per iteration with VSQRTPD
// for as long as a full group of four remains, then finishes the leftover
// elements one at a time with VSQRTSD.
//
// NOTE(review): the declared float64 result is simply whatever X0 holds on
// exit; for an empty slice this routine never writes X0, so the result looks
// incidental rather than meaningful - confirm against the callers.
func genSqrt_F64() {
	const lanes = 4 // float64 values held by one YMM register

	TEXT("Sqrt_AVX2_F64", NOSPLIT, "func(x []float64) float64")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// RCX indexes the vector loop, RAX the scalar loop; both scale by the
	// element size of 8 bytes.
	vecElems := Mem{Base: RDI}.Idx(RCX, 8)
	oneElem := Mem{Base: RDI}.Idx(RAX, 8)

	// Empty input skips all work; inputs shorter than one vector go straight
	// to the scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB0_7"))
	CMPQ(RSI, Imm(lanes))
	JAE(LabelRef("LBB0_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB0_6"))

	// RAX = len rounded down to a multiple of four, RCX = 0.
	Label("LBB0_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-lanes), RAX)
	XORL(ECX, ECX)

	// Vector loop: four square roots per pass until RCX reaches RAX, then
	// leave if that already covered the whole slice.
	Label("LBB0_4")
	VSQRTPD(vecElems, Y0)
	VMOVUPD(Y0, vecElems)
	ADDQ(Imm(lanes), RCX)
	CMPQ(RAX, RCX)
	JNE(LabelRef("LBB0_4"))
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB0_7"))

	// Scalar loop over the remaining elements.
	Label("LBB0_6")
	VMOVSD(oneElem, X0)
	VSQRTSD(X0, X0, X0)
	VMOVSD(X0, oneElem)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB0_6"))

	// Common exit.
	Label("LBB0_7")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()
}

// genSqrt_F32 emits the assembly for Sqrt_AVX2_F32, which overwrites every
// element of x with an approximation of its square root.
//
// Instead of a square-root instruction the generated code starts from the
// reciprocal-square-root estimate y = rsqrt(x) and evaluates
// (x*y*y - 3) * (-0.5*x*y), i.e. one refinement step of that estimate.
// Elements whose magnitude is below the smallest normal float32 produce 0.
// The vector loop handles 32 values per iteration, the scalar loop the rest.
//
// NOTE(review): the declared float32 result is whatever X0 holds on exit; it
// is never written for an empty slice and otherwise holds a table constant,
// so the result looks incidental - confirm against the callers.
func genSqrt_F32() {
	data := GLOBL("dataSqrtF32", RODATA|NOPTR)
	table := []uint32{
		0xc0400000, // -3.0
		0xbf000000, // -0.5
		0x7fffffff, // mask that clears the sign bit
		0x00800000, // smallest positive normal float32
	}
	for i, bits := range table {
		DATA(4*i, U32(bits))
	}

	TEXT("Sqrt_AVX2_F32", NOSPLIT, "func(x []float32) float32")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// Broadcast constants used by the vector loop.
	negThree, negHalf, absMask, minNormal := Y0, Y1, Y2, Y3

	// RCX indexes the vector loop, RAX the scalar loop; elements are 4 bytes.
	block := Mem{Base: RDI}.Idx(RCX, 4)
	single := Mem{Base: RDI}.Idx(RAX, 4)

	// Empty input skips all work; fewer than 32 elements go straight to the
	// scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB1_8"))
	CMPQ(RSI, Imm(32))
	JAE(LabelRef("LBB1_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB1_6"))

	// RAX = len rounded down to a multiple of 32, RCX = 0, constants loaded.
	Label("LBB1_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-32), RAX)
	XORL(ECX, ECX)
	VBROADCASTSS(data.Offset(0), negThree)
	VBROADCASTSS(data.Offset(4), negHalf)
	VBROADCASTSS(data.Offset(8), absMask)
	VBROADCASTSS(data.Offset(12), minNormal)

	// Vector loop: four YMM registers (Y4, Y5, Y6, Y8) per pass. The
	// instruction order below is interleaved on purpose and must be kept.
	Label("LBB1_4")
	VMOVUPS(block, Y4)
	VMOVUPS(block.Offset(32), Y5)
	VMOVUPS(block.Offset(64), Y6)
	VRSQRTPS(Y4, Y7)
	VMOVUPS(block.Offset(96), Y8)
	// First register: refine, then zero the lanes with |x| below minNormal.
	VMULPS(Y7, Y4, Y9)
	VFMADD213PS(negThree, Y9, Y7)
	VMULPS(negHalf, Y9, Y9)
	VMULPS(Y7, Y9, Y7)
	VANDPS(absMask, Y4, Y4)
	VCMPPS(Imm(2), Y4, minNormal, Y4)
	VANDPS(Y7, Y4, Y4)
	// Second register.
	VRSQRTPS(Y5, Y7)
	VMULPS(Y7, Y5, Y9)
	VFMADD213PS(negThree, Y9, Y7)
	VMULPS(negHalf, Y9, Y9)
	VMULPS(Y7, Y9, Y7)
	VANDPS(absMask, Y5, Y5)
	VCMPPS(Imm(2), Y5, minNormal, Y5)
	// Third register; its estimate is started before the second is masked.
	VRSQRTPS(Y6, Y9)
	VANDPS(Y7, Y5, Y5)
	VMULPS(Y6, Y9, Y7)
	VFMADD213PS(negThree, Y7, Y9)
	VMULPS(negHalf, Y7, Y7)
	VMULPS(Y7, Y9, Y7)
	VANDPS(absMask, Y6, Y6)
	VCMPPS(Imm(2), Y6, minNormal, Y6)
	VANDPS(Y7, Y6, Y6)
	// Fourth register; the masked result ends up in Y7.
	VRSQRTPS(Y8, Y7)
	VMULPS(Y7, Y8, Y9)
	VFMADD213PS(negThree, Y9, Y7)
	VMULPS(negHalf, Y9, Y9)
	VMULPS(Y7, Y9, Y7)
	VANDPS(absMask, Y8, Y8)
	VCMPPS(Imm(2), Y8, minNormal, Y8)
	VANDPS(Y7, Y8, Y7)
	// Write the four results back and advance.
	VMOVUPS(Y4, block)
	VMOVUPS(Y5, block.Offset(32))
	VMOVUPS(Y6, block.Offset(64))
	VMOVUPS(Y7, block.Offset(96))
	ADDQ(Imm(32), RCX)
	CMPQ(RAX, RCX)
	JNE(LabelRef("LBB1_4"))
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB1_8"))

	// Constants for the scalar loop.
	Label("LBB1_6")
	VMOVSS(data.Offset(0), X0)
	VMOVSS(data.Offset(4), X1)
	VBROADCASTSS(data.Offset(8), X2)
	VMOVSS(data.Offset(12), X3)

	// Scalar loop: same computation, one element at a time. Here the compare
	// marks the too-small inputs and VANDNPS clears them.
	Label("LBB1_7")
	VMOVSS(single, X4)
	VRSQRTSS(X4, X4, X5)
	VMULSS(X5, X4, X6)
	VFMADD213SS(X0, X6, X5)
	VMULSS(X1, X6, X6)
	VMULSS(X5, X6, X5)
	VANDPS(X2, X4, X4)
	VCMPSS(Imm(1), X3, X4, X4)
	VANDNPS(X5, X4, X4)
	VMOVSS(X4, single)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB1_7"))

	// Common exit.
	Label("LBB1_8")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()
}

// genRound_F64 emits the assembly for Round_AVX2_F64, which rounds every
// element of x in place.
//
// Each value v is replaced by trunc(v + c), where c is the largest float64
// below 0.5 carrying the sign of v; the truncation is done by VROUNDPD /
// VROUNDSD with immediate 11. The vector loop handles 16 values per
// iteration, the scalar loop the remainder.
//
// NOTE(review): the declared float64 result is whatever X0 holds on exit and
// is never written for an empty slice - confirm callers ignore it.
func genRound_F64() {
	data := GLOBL("dataRoundF64", RODATA|NOPTR)
	table := []uint64{
		0x8000000000000000, // sign-bit mask
		0x3fdfffffffffffff, // largest float64 below 0.5
		0x8000000000000000, // sign-bit mask, low half of the 16-byte copy
		0x8000000000000000, // sign-bit mask, high half of the 16-byte copy
	}
	for i, bits := range table {
		DATA(8*i, U64(bits))
	}

	TEXT("Round_AVX2_F64", NOSPLIT, "func(x []float64) float64")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	signMask, almostHalf := Y0, Y1
	lanes := []Op{Y2, Y3, Y4, Y5}
	// RCX indexes the vector loop, RAX the scalar loop; elements are 8 bytes.
	block := Mem{Base: RDI}.Idx(RCX, 8)
	single := Mem{Base: RDI}.Idx(RAX, 8)

	// Empty input skips all work; fewer than 16 elements go straight to the
	// scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB2_8"))
	CMPQ(RSI, Imm(16))
	JAE(LabelRef("LBB2_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB2_6"))

	// RAX = len rounded down to a multiple of 16, RCX = 0, constants loaded.
	Label("LBB2_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-16), RAX)
	XORL(ECX, ECX)
	VBROADCASTSD(data.Offset(0), signMask)
	VBROADCASTSD(data.Offset(8), almostHalf)

	// Vector loop: load four registers, round each, store them back.
	Label("LBB2_4")
	for i, v := range lanes {
		VMOVUPD(block.Offset(32*i), v)
	}
	for _, v := range lanes {
		VANDPD(signMask, v, Y6)   // isolate the sign of each lane
		VORPD(almostHalf, Y6, Y6) // signed "just under one half"
		VADDPD(Y6, v, v)
		VROUNDPD(Imm(11), v, v) // truncate toward zero
	}
	for i, v := range lanes {
		VMOVUPD(v, block.Offset(32*i))
	}
	ADDQ(Imm(16), RCX)
	CMPQ(RAX, RCX)
	JNE(LabelRef("LBB2_4"))
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB2_8"))

	// Constants for the scalar loop.
	Label("LBB2_6")
	VMOVUPD(data.Offset(16), X0)
	VMOVDDUP(data.Offset(8), X1)

	// Scalar loop over the remaining elements.
	Label("LBB2_7")
	VMOVSD(single, X2)
	VANDPD(X0, X2, X3)
	VORPD(X1, X3, X3)
	VADDSD(X3, X2, X2)
	VROUNDSD(Imm(11), X2, X2, X2)
	VMOVSD(X2, single)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB2_7"))

	// Common exit.
	Label("LBB2_8")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()
}

// genRound_F32 emits the assembly for Round_AVX2_F32, which rounds every
// element of x in place.
//
// Each value v is replaced by trunc(v + c), where c is the largest float32
// below 0.5 carrying the sign of v; the truncation is done by VROUNDPS /
// VROUNDSS with immediate 11. The vector loop handles 32 values per
// iteration, the scalar loop the remainder.
//
// NOTE(review): the declared float32 result is whatever X0 holds on exit and
// is never written for an empty slice - confirm callers ignore it.
func genRound_F32() {
	data := GLOBL("dataRoundF32", RODATA|NOPTR)
	table := []uint32{
		0x80000000, // sign-bit mask
		0x3effffff, // largest float32 below 0.5
	}
	for i, bits := range table {
		DATA(4*i, U32(bits))
	}

	TEXT("Round_AVX2_F32", NOSPLIT, "func(x []float32) float32")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	signMask, almostHalf := Y0, Y1
	lanes := []Op{Y2, Y3, Y4, Y5}
	// RCX indexes the vector loop, RAX the scalar loop; elements are 4 bytes.
	block := Mem{Base: RDI}.Idx(RCX, 4)
	single := Mem{Base: RDI}.Idx(RAX, 4)

	// Empty input skips all work; fewer than 32 elements go straight to the
	// scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB3_8"))
	CMPQ(RSI, Imm(32))
	JAE(LabelRef("LBB3_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB3_6"))

	// RAX = len rounded down to a multiple of 32, RCX = 0, constants loaded.
	Label("LBB3_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-32), RAX)
	XORL(ECX, ECX)
	VBROADCASTSS(data.Offset(0), signMask)
	VBROADCASTSS(data.Offset(4), almostHalf)

	// Vector loop: load four registers, round each, store them back.
	Label("LBB3_4")
	for i, v := range lanes {
		VMOVUPS(block.Offset(32*i), v)
	}
	for _, v := range lanes {
		VANDPS(signMask, v, Y6)   // isolate the sign of each lane
		VORPS(almostHalf, Y6, Y6) // signed "just under one half"
		VADDPS(Y6, v, v)
		VROUNDPS(Imm(11), v, v) // truncate toward zero
	}
	for i, v := range lanes {
		VMOVUPS(v, block.Offset(32*i))
	}
	ADDQ(Imm(32), RCX)
	CMPQ(RAX, RCX)
	JNE(LabelRef("LBB3_4"))
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB3_8"))

	// Constants for the scalar loop.
	Label("LBB3_6")
	VBROADCASTSS(data.Offset(0), X0)
	VBROADCASTSS(data.Offset(4), X1)

	// Scalar loop over the remaining elements.
	Label("LBB3_7")
	VMOVSS(single, X2)
	VANDPS(X0, X2, X3)
	VORPS(X1, X3, X3)
	VADDSS(X3, X2, X2)
	VROUNDSS(Imm(11), X2, X2, X2)
	VMOVSS(X2, single)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB3_7"))

	// Common exit.
	Label("LBB3_8")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()
}

// genFloor_F64 emits the assembly for Floor_AVX2_F64, which replaces every
// element of x with the result of VROUNDPD / VROUNDSD using immediate 9
// (round toward negative infinity).
//
// The slice is processed in chunks of 16 values. The main loop is unrolled to
// two chunks per iteration, a single leftover chunk is handled separately,
// and the elements past the last full chunk go through a scalar loop.
//
// NOTE(review): the declared float64 result is whatever X0 holds on exit and
// is never written for an empty slice - confirm callers ignore it.
func genFloor_F64() {
	const mode = 9 // rounding-control immediate shared by all round instructions

	TEXT("Floor_AVX2_F64", NOSPLIT, "func(x []float64) float64")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// RCX indexes the chunk loops, RAX the scalar loop; elements are 8 bytes.
	chunk := Mem{Base: RDI}.Idx(RCX, 8)
	single := Mem{Base: RDI}.Idx(RAX, 8)
	regs := []Op{Y0, Y1, Y2, Y3}

	// roundChunk emits the code that rounds the 16 values found disp bytes
	// past x[RCX]: four rounding loads followed by four stores.
	roundChunk := func(disp int) {
		for i, r := range regs {
			VROUNDPD(Imm(mode), chunk.Offset(disp+32*i), r)
		}
		for i, r := range regs {
			VMOVUPD(r, chunk.Offset(disp+32*i))
		}
	}

	// Empty input skips all work; fewer than 16 elements go straight to the
	// scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB4_11"))
	CMPQ(RSI, Imm(16))
	JAE(LabelRef("LBB4_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB4_10"))

	// RAX = len rounded down to a multiple of 16, R8 = number of chunks.
	// With exactly one chunk the unrolled loop is bypassed via LBB4_4;
	// otherwise RDX = chunk count rounded down to even and RCX = 0.
	Label("LBB4_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-16), RAX)
	LEAQ(Mem{Base: RAX}.Offset(-16), RCX)
	MOVQ(RCX, R8)
	SHRQ(Imm(4), R8)
	ADDQ(Imm(1), R8)
	TESTQ(RCX, RCX)
	JE(LabelRef("LBB4_4"))
	MOVQ(R8, RDX)
	ANDQ(I32(-2), RDX)
	XORL(ECX, ECX)

	// Unrolled loop: two chunks (32 values) per iteration.
	Label("LBB4_6")
	roundChunk(0)
	roundChunk(128)
	ADDQ(Imm(32), RCX)
	ADDQ(I32(-2), RDX)
	JNE(LabelRef("LBB4_6"))
	TESTB(Imm(1), R8B)
	JE(LabelRef("LBB4_9"))

	// Odd chunk left over by the unrolled loop.
	Label("LBB4_8")
	roundChunk(0)

	// Done if the chunks covered the whole slice.
	Label("LBB4_9")
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB4_11"))

	// Scalar loop over the remaining elements.
	Label("LBB4_10")
	VMOVSD(single, X0)
	VROUNDSD(Imm(mode), X0, X0, X0)
	VMOVSD(X0, single)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB4_10"))

	// Common exit.
	Label("LBB4_11")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()

	// Entry used when the unrolled loop is skipped: start at index 0 and
	// process the single chunk if the chunk count is odd.
	Label("LBB4_4")
	XORL(ECX, ECX)
	TESTB(Imm(1), R8B)
	JNE(LabelRef("LBB4_8"))
	JMP(LabelRef("LBB4_9"))
}

// genFloor_F32 emits the assembly for Floor_AVX2_F32, which replaces every
// element of x with the result of VROUNDPS / VROUNDSS using immediate 9
// (round toward negative infinity).
//
// The slice is processed in chunks of 32 values. The main loop is unrolled to
// two chunks per iteration, a single leftover chunk is handled separately,
// and the elements past the last full chunk go through a scalar loop.
//
// NOTE(review): the declared float32 result is whatever X0 holds on exit and
// is never written for an empty slice - confirm callers ignore it.
func genFloor_F32() {
	const mode = 9 // rounding-control immediate shared by all round instructions

	TEXT("Floor_AVX2_F32", NOSPLIT, "func(x []float32) float32")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// RCX indexes the chunk loops, RAX the scalar loop; elements are 4 bytes.
	chunk := Mem{Base: RDI}.Idx(RCX, 4)
	single := Mem{Base: RDI}.Idx(RAX, 4)
	regs := []Op{Y0, Y1, Y2, Y3}

	// roundChunk emits the code that rounds the 32 values found disp bytes
	// past x[RCX]: four rounding loads followed by four stores.
	roundChunk := func(disp int) {
		for i, r := range regs {
			VROUNDPS(Imm(mode), chunk.Offset(disp+32*i), r)
		}
		for i, r := range regs {
			VMOVUPS(r, chunk.Offset(disp+32*i))
		}
	}

	// Empty input skips all work; fewer than 32 elements go straight to the
	// scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB5_11"))
	CMPQ(RSI, Imm(32))
	JAE(LabelRef("LBB5_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB5_10"))

	// RAX = len rounded down to a multiple of 32, R8 = number of chunks.
	// With exactly one chunk the unrolled loop is bypassed via LBB5_4;
	// otherwise RDX = chunk count rounded down to even and RCX = 0.
	Label("LBB5_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-32), RAX)
	LEAQ(Mem{Base: RAX}.Offset(-32), RCX)
	MOVQ(RCX, R8)
	SHRQ(Imm(5), R8)
	ADDQ(Imm(1), R8)
	TESTQ(RCX, RCX)
	JE(LabelRef("LBB5_4"))
	MOVQ(R8, RDX)
	ANDQ(I32(-2), RDX)
	XORL(ECX, ECX)

	// Unrolled loop: two chunks (64 values) per iteration.
	Label("LBB5_6")
	roundChunk(0)
	roundChunk(128)
	ADDQ(Imm(64), RCX)
	ADDQ(I32(-2), RDX)
	JNE(LabelRef("LBB5_6"))
	TESTB(Imm(1), R8B)
	JE(LabelRef("LBB5_9"))

	// Odd chunk left over by the unrolled loop.
	Label("LBB5_8")
	roundChunk(0)

	// Done if the chunks covered the whole slice.
	Label("LBB5_9")
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB5_11"))

	// Scalar loop over the remaining elements.
	Label("LBB5_10")
	VMOVSS(single, X0)
	VROUNDSS(Imm(mode), X0, X0, X0)
	VMOVSS(X0, single)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB5_10"))

	// Common exit.
	Label("LBB5_11")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()

	// Entry used when the unrolled loop is skipped: start at index 0 and
	// process the single chunk if the chunk count is odd.
	Label("LBB5_4")
	XORL(ECX, ECX)
	TESTB(Imm(1), R8B)
	JNE(LabelRef("LBB5_8"))
	JMP(LabelRef("LBB5_9"))
}

// genCeil_F64 emits the assembly for Ceil_AVX2_F64, which replaces every
// element of x with the result of VROUNDPD / VROUNDSD using immediate 10
// (round toward positive infinity).
//
// The slice is processed in chunks of 16 values. The main loop is unrolled to
// two chunks per iteration, a single leftover chunk is handled separately,
// and the elements past the last full chunk go through a scalar loop.
//
// NOTE(review): the declared float64 result is whatever X0 holds on exit and
// is never written for an empty slice - confirm callers ignore it.
func genCeil_F64() {
	const mode = 10 // rounding-control immediate shared by all round instructions

	TEXT("Ceil_AVX2_F64", NOSPLIT, "func(x []float64) float64")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// RCX indexes the chunk loops, RAX the scalar loop; elements are 8 bytes.
	chunk := Mem{Base: RDI}.Idx(RCX, 8)
	single := Mem{Base: RDI}.Idx(RAX, 8)
	regs := []Op{Y0, Y1, Y2, Y3}

	// roundChunk emits the code that rounds the 16 values found disp bytes
	// past x[RCX]: four rounding loads followed by four stores.
	roundChunk := func(disp int) {
		for i, r := range regs {
			VROUNDPD(Imm(mode), chunk.Offset(disp+32*i), r)
		}
		for i, r := range regs {
			VMOVUPD(r, chunk.Offset(disp+32*i))
		}
	}

	// Empty input skips all work; fewer than 16 elements go straight to the
	// scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB6_11"))
	CMPQ(RSI, Imm(16))
	JAE(LabelRef("LBB6_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB6_10"))

	// RAX = len rounded down to a multiple of 16, R8 = number of chunks.
	// With exactly one chunk the unrolled loop is bypassed via LBB6_4;
	// otherwise RDX = chunk count rounded down to even and RCX = 0.
	Label("LBB6_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-16), RAX)
	LEAQ(Mem{Base: RAX}.Offset(-16), RCX)
	MOVQ(RCX, R8)
	SHRQ(Imm(4), R8)
	ADDQ(Imm(1), R8)
	TESTQ(RCX, RCX)
	JE(LabelRef("LBB6_4"))
	MOVQ(R8, RDX)
	ANDQ(I32(-2), RDX)
	XORL(ECX, ECX)

	// Unrolled loop: two chunks (32 values) per iteration.
	Label("LBB6_6")
	roundChunk(0)
	roundChunk(128)
	ADDQ(Imm(32), RCX)
	ADDQ(I32(-2), RDX)
	JNE(LabelRef("LBB6_6"))
	TESTB(Imm(1), R8B)
	JE(LabelRef("LBB6_9"))

	// Odd chunk left over by the unrolled loop.
	Label("LBB6_8")
	roundChunk(0)

	// Done if the chunks covered the whole slice.
	Label("LBB6_9")
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB6_11"))

	// Scalar loop over the remaining elements.
	Label("LBB6_10")
	VMOVSD(single, X0)
	VROUNDSD(Imm(mode), X0, X0, X0)
	VMOVSD(X0, single)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB6_10"))

	// Common exit.
	Label("LBB6_11")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()

	// Entry used when the unrolled loop is skipped: start at index 0 and
	// process the single chunk if the chunk count is odd.
	Label("LBB6_4")
	XORL(ECX, ECX)
	TESTB(Imm(1), R8B)
	JNE(LabelRef("LBB6_8"))
	JMP(LabelRef("LBB6_9"))
}

// genCeil_F32 emits the assembly for Ceil_AVX2_F32, which replaces every
// element of x with the result of VROUNDPS / VROUNDSS using immediate 10
// (round toward positive infinity).
//
// The slice is processed in chunks of 32 values. The main loop is unrolled to
// two chunks per iteration, a single leftover chunk is handled separately,
// and the elements past the last full chunk go through a scalar loop.
//
// NOTE(review): the declared float32 result is whatever X0 holds on exit and
// is never written for an empty slice - confirm callers ignore it.
func genCeil_F32() {
	const mode = 10 // rounding-control immediate shared by all round instructions

	TEXT("Ceil_AVX2_F32", NOSPLIT, "func(x []float32) float32")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// RCX indexes the chunk loops, RAX the scalar loop; elements are 4 bytes.
	chunk := Mem{Base: RDI}.Idx(RCX, 4)
	single := Mem{Base: RDI}.Idx(RAX, 4)
	regs := []Op{Y0, Y1, Y2, Y3}

	// roundChunk emits the code that rounds the 32 values found disp bytes
	// past x[RCX]: four rounding loads followed by four stores.
	roundChunk := func(disp int) {
		for i, r := range regs {
			VROUNDPS(Imm(mode), chunk.Offset(disp+32*i), r)
		}
		for i, r := range regs {
			VMOVUPS(r, chunk.Offset(disp+32*i))
		}
	}

	// Empty input skips all work; fewer than 32 elements go straight to the
	// scalar loop starting at index 0.
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB7_11"))
	CMPQ(RSI, Imm(32))
	JAE(LabelRef("LBB7_3"))
	XORL(EAX, EAX)
	JMP(LabelRef("LBB7_10"))

	// RAX = len rounded down to a multiple of 32, R8 = number of chunks.
	// With exactly one chunk the unrolled loop is bypassed via LBB7_4;
	// otherwise RDX = chunk count rounded down to even and RCX = 0.
	Label("LBB7_3")
	MOVQ(RSI, RAX)
	ANDQ(I32(-32), RAX)
	LEAQ(Mem{Base: RAX}.Offset(-32), RCX)
	MOVQ(RCX, R8)
	SHRQ(Imm(5), R8)
	ADDQ(Imm(1), R8)
	TESTQ(RCX, RCX)
	JE(LabelRef("LBB7_4"))
	MOVQ(R8, RDX)
	ANDQ(I32(-2), RDX)
	XORL(ECX, ECX)

	// Unrolled loop: two chunks (64 values) per iteration.
	Label("LBB7_6")
	roundChunk(0)
	roundChunk(128)
	ADDQ(Imm(64), RCX)
	ADDQ(I32(-2), RDX)
	JNE(LabelRef("LBB7_6"))
	TESTB(Imm(1), R8B)
	JE(LabelRef("LBB7_9"))

	// Odd chunk left over by the unrolled loop.
	Label("LBB7_8")
	roundChunk(0)

	// Done if the chunks covered the whole slice.
	Label("LBB7_9")
	CMPQ(RAX, RSI)
	JE(LabelRef("LBB7_11"))

	// Scalar loop over the remaining elements.
	Label("LBB7_10")
	VMOVSS(single, X0)
	VROUNDSS(Imm(mode), X0, X0, X0)
	VMOVSS(X0, single)
	ADDQ(Imm(1), RAX)
	CMPQ(RSI, RAX)
	JNE(LabelRef("LBB7_10"))

	// Common exit.
	Label("LBB7_11")
	VZEROUPPER()
	Store(X0, ReturnIndex(0))
	RET()

	// Entry used when the unrolled loop is skipped: start at index 0 and
	// process the single chunk if the chunk count is odd.
	Label("LBB7_4")
	XORL(ECX, ECX)
	TESTB(Imm(1), R8B)
	JNE(LabelRef("LBB7_8"))
	JMP(LabelRef("LBB7_9"))
}

// genPow_4x_F64 emits the assembly for Pow_4x_AVX2_F64(x, y []float64).
//
// The generated routine walks x and y four float64 values at a time and
// stores each four-wide result back into x. Only len(x) rounded down to a
// multiple of four is processed; elements past that point are left untouched.
// len(y) is never loaded.
//
// NOTE(review): judging by the name, the constant table and the instruction
// pattern this presumably computes x[i] = pow(x[i], y[i]) with explicit
// handling of zero, infinite, negative and NaN operands - confirm against the
// reference implementation this was generated from.
// NOTE(review): since len(y) is not read, the caller presumably guarantees
// len(y) >= len(x) - confirm.
// NOTE(review): RSP is adjusted by hand (1192 bytes) and slots from
// -128(RSP) upwards are used, while no frame is requested through avo and the
// function is NOSPLIT - confirm this is safe with Go's stack management.
func genPow_4x_F64() {

	// Constant table. The trailing comments name the labels these entries
	// had in the code this generator was derived from.
	data := GLOBL("dataPowF64", RODATA|NOPTR)
	DATA(0, U64(9223372036854775807))   // Label("LCPI9_0")
	DATA(8, U64(0x3fe6a09e667f3bcd))    // Label("LCPI9_3")
	DATA(16, U64(0xbff0000000000000))   // Label("LCPI9_4")
	DATA(24, U64(0x401a509f46f4fa53))   // Label("LCPI9_5")
	DATA(32, U64(0x3fdfe818a0fe1a83))   // Label("LCPI9_6")
	DATA(40, U64(0x3f07bc0962b395ca))   // Label("LCPI9_7")
	DATA(48, U64(0x404e798eb86c3351))   // Label("LCPI9_8")
	DATA(56, U64(0x403de9738b8cb9c9))   // Label("LCPI9_9")
	DATA(64, U64(0x40340a202d99830a))   // Label("LCPI9_10")
	DATA(72, U64(0x404c8e7597479a10))   // Label("LCPI9_11")
	DATA(80, U64(0x4054c30b52213498))   // Label("LCPI9_12")
	DATA(88, U64(0x402e20359e903e37))   // Label("LCPI9_13")
	DATA(96, U64(0x407351945dc908a5))   // Label("LCPI9_14")
	DATA(104, U64(0x406bb86590fcfb56))  // Label("LCPI9_15")
	DATA(112, U64(0x404e0f304466448e))  // Label("LCPI9_16")
	DATA(120, U64(0x406b0db13e48e066))  // Label("LCPI9_17")
	DATA(128, U64(4841369599423283200)) // Label("LCPI9_18")
	DATA(136, U64(0xc3300000000003ff))  // Label("LCPI9_19")
	DATA(144, U64(0x3ff0000000000000))  // Label("LCPI9_20")
	DATA(152, U64(0xbfe0000000000000))  // Label("LCPI9_21")
	DATA(160, U64(0x3fe0000000000000))  // Label("LCPI9_22")
	DATA(168, U64(0x3ff71547652b82fe))  // Label("LCPI9_23")
	DATA(176, U64(0xbfe62e4000000000))  // Label("LCPI9_24")
	DATA(184, U64(0x3eb7f7d1cf79abca))  // Label("LCPI9_25")
	DATA(192, U64(0x3fe62e42fefa39ef))  // Label("LCPI9_26")
	DATA(200, U64(0x3e21eed8eff8d898))  // Label("LCPI9_27")
	DATA(208, U64(0x3de6124613a86d09))  // Label("LCPI9_28")
	DATA(216, U64(0x3e927e4fb7789f5c))  // Label("LCPI9_29")
	DATA(224, U64(0x3e5ae64567f544e4))  // Label("LCPI9_30")
	DATA(232, U64(0x3efa01a01a01a01a))  // Label("LCPI9_31")
	DATA(240, U64(0x3ec71de3a556c734))  // Label("LCPI9_32")
	DATA(248, U64(0x3f56c16c16c16c17))  // Label("LCPI9_33")
	DATA(256, U64(0x3f2a01a01a01a01a))  // Label("LCPI9_34")
	DATA(264, U64(0x3fa5555555555555))  // Label("LCPI9_35")
	DATA(272, U64(0x3f81111111111111))  // Label("LCPI9_36")
	DATA(280, U64(0x3fc5555555555555))  // Label("LCPI9_37")
	DATA(288, U64(2046))                // Label("LCPI9_38")
	DATA(296, U64(0x40a7700000000000))  // Label("LCPI9_39")
	DATA(304, U64(1))                   // Label("LCPI9_40")
	DATA(312, U64(0xc0a7700000000000))  // Label("LCPI9_41")
	DATA(320, U64(9218868437227405312)) // Label("LCPI9_42")
	DATA(328, U64(0x7ff8002040000000))  // Label("LCPI9_43")
	DATA(336, U64(4503599627370495))    // Label("LCPI9_1")
	DATA(344, U64(4503599627370495))
	DATA(352, U64(4602678819172646912)) // Label("LCPI9_2")
	DATA(360, U64(4602678819172646912))

	TEXT("Pow_4x_AVX2_F64", NOSPLIT, "func(x, y []float64)")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("y").Base(), RSI)
	Load(Param("x").Len(), RDX)

	// Reserve scratch space by moving RSP directly; LBB9_11 undoes this.
	SUBQ(I32(1192), RSP)
	// RDX = len(x) rounded down to a multiple of 4; leave if that is zero.
	ANDQ(I32(-4), RDX)
	JE(LabelRef("LBB9_11"))
	// R8 is the element index of the main loop.
	XORL(R8L, R8L)
	// Broadcast the table entries at offsets 0..312 to four lanes each and
	// park them in stack slots, using Y0 as the staging register. X6 is
	// zeroed along the way.
	VBROADCASTSD(data.Offset(0), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(512))
	VBROADCASTSD(data.Offset(8), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1120))
	VPXOR(X6, X6, X6)
	VBROADCASTSD(data.Offset(16), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1088))
	VBROADCASTSD(data.Offset(24), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1056))
	VBROADCASTSD(data.Offset(32), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(1024))
	VBROADCASTSD(data.Offset(40), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(992))
	VBROADCASTSD(data.Offset(48), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(960))
	VBROADCASTSD(data.Offset(56), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(928))
	VBROADCASTSD(data.Offset(64), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(896))
	VBROADCASTSD(data.Offset(72), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(864))
	VBROADCASTSD(data.Offset(80), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(832))
	VBROADCASTSD(data.Offset(88), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(800))
	VBROADCASTSD(data.Offset(96), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(768))
	VBROADCASTSD(data.Offset(104), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(736))
	VBROADCASTSD(data.Offset(112), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(704))
	VBROADCASTSD(data.Offset(120), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(672))
	VBROADCASTSD(data.Offset(128), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(640))
	VBROADCASTSD(data.Offset(136), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(608))
	VBROADCASTSD(data.Offset(144), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
	VBROADCASTSD(data.Offset(152), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(576))
	VBROADCASTSD(data.Offset(160), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(544))
	VBROADCASTSD(data.Offset(168), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(480))
	VBROADCASTSD(data.Offset(176), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(448))
	VBROADCASTSD(data.Offset(184), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(416))
	VBROADCASTSD(data.Offset(192), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(384))
	VBROADCASTSD(data.Offset(200), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(352))
	VBROADCASTSD(data.Offset(208), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(320))
	VBROADCASTSD(data.Offset(216), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(288))
	VBROADCASTSD(data.Offset(224), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(256))
	VBROADCASTSD(data.Offset(232), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(224))
	VBROADCASTSD(data.Offset(240), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(192))
	VBROADCASTSD(data.Offset(248), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(160))
	VBROADCASTSD(data.Offset(256), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(128))
	VBROADCASTSD(data.Offset(264), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(96))
	VBROADCASTSD(data.Offset(272), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
	VBROADCASTSD(data.Offset(280), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
	VBROADCASTSD(data.Offset(288), Y0)
	VMOVUPS(Y0, Mem{Base: RSP})
	VBROADCASTSD(data.Offset(296), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
	VBROADCASTSD(data.Offset(304), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
	VBROADCASTSD(data.Offset(312), Y0)
	VMOVUPD(Y0, Mem{Base: RSP}.Offset(-96))
	// Y5 and Y10 both receive the entry at offset 320 (the bit pattern
	// 0x7ff0000000000000) and stay live across loop iterations.
	VPBROADCASTQ(data.Offset(320), Y5)
	VBROADCASTSD(data.Offset(320), Y10)
	JMP(LabelRef("LBB9_2"))

	// Loop tail: store the four results held in Y2 into x, advance the
	// index by 4 and leave once it reaches RDX; otherwise fall through into
	// the next iteration.
	Label("LBB9_10")
	{
		VMOVUPD(Y2, Mem{Base: RDI}.Idx(R8, 8))
		ADDQ(Imm(4), R8)
		CMPQ(R8, RDX)
		JAE(LabelRef("LBB9_11"))
	}

	// Loop body. Y13 = four values of x, Y12 = four values of y. Y10 is
	// copied to Y9 first because the body reuses Y10 as scratch; it is
	// restored from Y9 on both exits of this section.
	// NOTE(review): the arithmetic below presumably evaluates
	// exp(y * log(|x|)) via polynomial/rational approximations and then
	// builds overflow/underflow masks in Y3/Y4 - confirm.
	Label("LBB9_2")
	{
		VMOVAPD(Y10, Y9)
		VMOVDQU(Mem{Base: RDI}.Idx(R8, 8), Y13)
		VMOVUPD(Mem{Base: RSI}.Idx(R8, 8), Y12)
		VPAND(Mem{Base: RSP}.Offset(512), Y13, Y10)
		VMOVUPD(data.Offset(336), X1)
		VANDPD(Mem{Base: RDI}.Idx(R8, 8), X1, X2)
		VMOVUPD(data.Offset(352), X0)
		VORPD(X0, X2, X2)
		VANDPD(Mem{Base: RDI}.Idx(R8, 8).Offset(16), X1, X3)
		VORPD(X0, X3, X3)
		VINSERTF128(Imm(1), X3, Y2, Y3)
		VMOVUPD(Mem{Base: RSP}.Offset(1120), Y0)
		VCMPPD(Imm(1), Y3, Y0, Y2)
		VANDNPD(Y3, Y2, Y4)
		VADDPD(Mem{Base: RSP}.Offset(1088), Y3, Y3)
		VADDPD(Y4, Y3, Y4)
		VMULPD(Y4, Y4, Y3)
		VMULPD(Y3, Y3, Y7)
		VMOVUPD(Mem{Base: RSP}.Offset(1024), Y8)
		VFMADD213PD(Mem{Base: RSP}.Offset(1056), Y4, Y8)
		VFMADD231PD(Mem{Base: RSP}.Offset(992), Y3, Y8)
		VMOVUPD(Mem{Base: RSP}.Offset(928), Y11)
		VFMADD213PD(Mem{Base: RSP}.Offset(960), Y4, Y11)
		VMOVUPD(Mem{Base: RSP}.Offset(864), Y14)
		VFMADD213PD(Mem{Base: RSP}.Offset(896), Y4, Y14)
		VFMADD231PD(Y11, Y3, Y14)
		VFMADD231PD(Y8, Y7, Y14)
		VMULPD(Y4, Y3, Y8)
		VMULPD(Y14, Y8, Y8)
		VADDPD(Mem{Base: RSP}.Offset(832), Y3, Y11)
		VFMADD231PD(Mem{Base: RSP}.Offset(800), Y4, Y11)
		VMOVUPD(Mem{Base: RSP}.Offset(736), Y14)
		VFMADD213PD(Mem{Base: RSP}.Offset(768), Y4, Y14)
		VMOVUPD(Mem{Base: RSP}.Offset(672), Y15)
		VFMADD213PD(Mem{Base: RSP}.Offset(704), Y4, Y15)
		VFMADD231PD(Y14, Y3, Y15)
		VFMADD231PD(Y11, Y7, Y15)
		VDIVPD(Y15, Y8, Y7)
		VMOVDQU(Y10, Mem{Base: RSP}.Offset(1152))
		VPSRLQ(Imm(52), Y10, Y8)
		VPOR(Mem{Base: RSP}.Offset(640), Y8, Y8)
		VADDPD(Mem{Base: RSP}.Offset(608), Y8, Y8)
		VMOVUPD(Mem{Base: RSP}.Offset(-128), Y0)
		VANDPD(Y0, Y2, Y2)
		VADDPD(Y2, Y8, Y8)
		VMULPD(Y12, Y8, Y2)
		VROUNDPD(Imm(8), Y2, Y2)
		VFNMADD213PD(Y2, Y12, Y8)
		VMOVUPD(Mem{Base: RSP}.Offset(576), Y1)
		VMOVAPD(Y1, Y11)
		VFMADD213PD(Y4, Y3, Y11)
		VADDPD(Y7, Y11, Y11)
		VMOVUPD(Mem{Base: RSP}.Offset(544), Y10)
		VMULPD(Y4, Y10, Y14)
		VMULPD(Y1, Y3, Y15)
		VFMADD231PD(Y14, Y4, Y15)
		VSUBPD(Y4, Y11, Y4)
		VFMADD231PD(Y3, Y10, Y4)
		VMOVUPD(Mem{Base: RSP}.Offset(480), Y1)
		VMULPD(Y1, Y12, Y3)
		VMULPD(Y3, Y11, Y3)
		VROUNDPD(Imm(8), Y3, Y3)
		VMULPD(Mem{Base: RSP}.Offset(448), Y3, Y14)
		VFMADD231PD(Y11, Y12, Y14)
		VFMSUB231PD(Mem{Base: RSP}.Offset(416), Y3, Y14)
		VMOVUPD(Mem{Base: RSP}.Offset(384), Y11)
		VFMADD231PD(Y8, Y11, Y14)
		VSUBPD(Y7, Y15, Y7)
		VADDPD(Y4, Y7, Y4)
		VFNMSUB213PD(Y14, Y12, Y4)
		VMULPD(Y1, Y4, Y7)
		VROUNDPD(Imm(8), Y7, Y7)
		VFNMADD231PD(Y11, Y7, Y4)
		VMULPD(Y4, Y4, Y8)
		VMOVUPD(Mem{Base: RSP}.Offset(320), Y11)
		VFMADD213PD(Mem{Base: RSP}.Offset(352), Y4, Y11)
		VMOVUPD(Mem{Base: RSP}.Offset(256), Y14)
		VFMADD213PD(Mem{Base: RSP}.Offset(288), Y4, Y14)
		VMOVUPD(Mem{Base: RSP}.Offset(192), Y15)
		VFMADD213PD(Mem{Base: RSP}.Offset(224), Y4, Y15)
		VFMADD231PD(Y14, Y8, Y15)
		VMOVUPD(Mem{Base: RSP}.Offset(128), Y14)
		VFMADD213PD(Mem{Base: RSP}.Offset(160), Y4, Y14)
		VMOVUPD(Mem{Base: RSP}.Offset(64), Y1)
		VFMADD213PD(Mem{Base: RSP}.Offset(96), Y4, Y1)
		VFMADD231PD(Y14, Y8, Y1)
		VMOVUPD(Mem{Base: RSP}.Offset(32), Y14)
		VFMADD213PD(Y10, Y4, Y14)
		VFMADD213PD(Y4, Y8, Y14)
		VMULPD(Y8, Y8, Y4)
		VFMADD231PD(Y11, Y4, Y15)
		VFMADD231PD(Y1, Y4, Y14)
		VMULPD(Y4, Y4, Y1)
		VFMADD231PD(Y15, Y1, Y14)
		VADDPD(Y0, Y14, Y1)
		VADDPD(Y2, Y3, Y2)
		VADDPD(Y7, Y2, Y15)
		VROUNDPD(Imm(8), Y15, Y2)
		// Convert the four rounded doubles in Y2 to 64-bit integers one lane
		// at a time through general-purpose registers and reassemble them.
		VCVTTSD2SIQ(X2, R9)
		VPERMILPD(Imm(1), X2, X3)
		VCVTTSD2SIQ(X3, RAX)
		VEXTRACTF128(Imm(1), Y2, X2)
		VCVTTSD2SIQ(X2, RCX)
		VMOVQ(RCX, X3)
		VPERMILPD(Imm(1), X2, X2)
		VCVTTSD2SIQ(X2, RCX)
		VMOVQ(RCX, X2)
		VPUNPCKLQDQ(X2, X3, X2)
		VMOVQ(R9, X3)
		VMOVQ(RAX, X4)
		VPUNPCKLQDQ(X4, X3, X3)
		VINSERTI128(Imm(1), X2, Y3, Y2)
		VPSRAD(Imm(31), Y1, Y3)
		VPSRAD(Imm(20), Y1, Y4)
		VPSRLQ(Imm(32), Y4, Y4)
		VPBLENDD(Imm(170), Y3, Y4, Y3)
		VPADDQ(Y3, Y2, Y4)
		VPCMPGTQ(Mem{Base: RSP}, Y4, Y3)
		VMOVUPD(Mem{Base: RSP}.Offset(-32), Y0)
		VCMPPD(Imm(1), Y15, Y0, Y7)
		VPOR(Y7, Y3, Y3)
		VMOVDQU(Mem{Base: RSP}.Offset(-64), Y0)
		VPCMPGTQ(Y4, Y0, Y4)
		VCMPPD(Imm(1), Mem{Base: RSP}.Offset(-96), Y15, Y7)
		VPOR(Y7, Y4, Y4)
		VPSLLQ(Imm(52), Y2, Y2)
		VPADDQ(Y1, Y2, Y2)
		// Take the fix-up path in LBB9_3 only if some lane has a bit set in
		// either mask (Y3 | Y4).
		VPOR(Y3, Y4, Y1)
		VPTEST(Y1, Y1)
		JNE(LabelRef("LBB9_3"))
		VMOVAPD(Y9, Y10)
		JMP(LabelRef("LBB9_5"))
	}

	// Fix-up: lanes flagged in Y4 are cleared in Y2, lanes flagged in Y3 are
	// replaced by the value saved in Y9; Y10 is restored from Y9.
	Label("LBB9_3")
	{
		VPANDN(Y2, Y4, Y1)
		VMOVAPD(Y9, Y10)
		VBLENDVPD(Y3, Y9, Y1, Y2)
	}

	// Y11 = x masked with Y5, Y4 = lanes where that is zero, Y14 / Y3 = lanes
	// where y is below / equal to zero, Y7 = per-lane sign of x. LBB9_7 runs
	// only if some lane of x has its sign bit set; otherwise Y7 is cleared.
	Label("LBB9_5")
	{
		VPAND(Y5, Y13, Y11)
		VPCMPEQQ(Y6, Y11, Y4)
		VPSRAD(Imm(31), Y13, Y1)
		VPSHUFD(Imm(245), Y1, Y7)
		VCMPPD(Imm(1), Y6, Y12, Y14)
		VCMPPD(Imm(0), Y6, Y12, Y3)
		VANDPD(Mem{Base: RSP}.Offset(-128), Y3, Y1)
		VBLENDVPD(Y14, Y10, Y1, Y1)
		VBLENDVPD(Y4, Y1, Y2, Y2)
		VPTEST(Y7, Y7)
		JNE(LabelRef("LBB9_7"))
		VPXOR(X7, X7, X7)
		JMP(LabelRef("LBB9_8"))
	}

	// Handling for lanes where x has its sign bit set. Y8 marks lanes where
	// y equals its own rounded value; the rounded y is converted to integers
	// and its lowest bit is shifted into the sign position (Y0). The entry
	// at table offset 328 is blended in for some lanes.
	// NOTE(review): presumably this distinguishes integer from non-integer
	// and odd from even exponents for negative bases - confirm.
	Label("LBB9_7")
	{
		VROUNDPD(Imm(8), Y12, Y1)
		VCMPPD(Imm(0), Y1, Y12, Y8)
		VCVTTSD2SIQ(X1, R9)
		VPERMILPD(Imm(1), X1, X10)
		VCVTTSD2SIQ(X10, RCX)
		VEXTRACTF128(Imm(1), Y1, X1)
		VCVTTSD2SIQ(X1, RAX)
		VXORPD(X10, X10, X10)
		VMOVQ(RAX, X6)
		VPERMILPD(Imm(1), X1, X1)
		VCVTTSD2SIQ(X1, RAX)
		VMOVQ(RAX, X1)
		VPUNPCKLQDQ(X1, X6, X1)
		VMOVQ(R9, X6)
		VMOVQ(RCX, X0)
		VPUNPCKLQDQ(X0, X6, X0)
		VINSERTI128(Imm(1), X1, Y0, Y0)
		VPSLLQ(Imm(63), Y0, Y0)
		VPOR(Y2, Y0, Y1)
		VCMPPD(Imm(0), Y10, Y13, Y6)
		VBROADCASTSD(data.Offset(328), Y10)
		VBLENDVPD(Y6, Y2, Y10, Y6)
		VMOVAPD(Y9, Y10)
		VBLENDVPD(Y8, Y1, Y6, Y1)
		VXORPD(X6, X6, X6)
		VBLENDVPD(Y7, Y1, Y2, Y2)
		VANDPD(Y0, Y8, Y7)
	}

	// Final checks against the Y5 pattern. If the combined mask in Y0 has
	// every bit set (VPTEST against all-ones, carry flag), the result in Y2
	// is stored as is; otherwise the blends below patch individual lanes,
	// including lanes where x or y compares unordered with itself.
	Label("LBB9_8")
	{
		VPCMPEQD(Y9, Y9, Y9)
		VANDPD(Y5, Y12, Y0)
		VANDPD(Y5, Y15, Y1)
		VPCMPEQQ(Y5, Y1, Y15)
		VPXOR(Y9, Y15, Y1)
		VPCMPEQQ(Y5, Y0, Y8)
		VPCMPEQQ(Y5, Y11, Y11)
		VPXOR(Y9, Y11, Y0)
		VPANDN(Y0, Y8, Y0)
		VPOR(Y4, Y1, Y1)
		VPAND(Y0, Y1, Y0)
		VPTEST(Y9, Y0)
		JB(LabelRef("LBB9_10"))
		VPXOR(Y9, Y8, Y0)
		VPANDN(Y0, Y15, Y0)
		VMOVUPD(Mem{Base: RSP}.Offset(-128), Y8)
		VMOVUPD(Mem{Base: RSP}.Offset(1152), Y9)
		VCMPPD(Imm(0), Y8, Y9, Y1)
		VCMPPD(Imm(1), Y9, Y8, Y4)
		VPSRAD(Imm(31), Y12, Y6)
		VPXOR(Y4, Y6, Y4)
		VPXOR(X6, X6, X6)
		VBLENDVPD(Y4, Y10, Y6, Y4)
		VBLENDVPD(Y1, Y8, Y4, Y1)
		VBLENDVPD(Y0, Y2, Y1, Y0)
		VANDPD(Y2, Y7, Y1)
		VANDPD(Y7, Y13, Y2)
		VORPD(Y2, Y9, Y2)
		VBLENDVPD(Y14, Y1, Y2, Y1)
		VBLENDVPD(Y3, Y8, Y1, Y1)
		VBLENDVPD(Y11, Y1, Y0, Y0)
		VCMPPD(Imm(3), Y13, Y13, Y1)
		VCMPPD(Imm(3), Y12, Y12, Y2)
		VORPD(Y1, Y2, Y1)
		VADDPD(Y13, Y12, Y2)
		VBLENDVPD(Y1, Y2, Y0, Y2)
		JMP(LabelRef("LBB9_10"))
	}

	// Exit: give the scratch space back and return.
	Label("LBB9_11")
	{
		ADDQ(I32(1192), RSP)
		VZEROUPPER()
		RET()
	}
}

// genPow_8x_F32 emits the read-only constant table "genPowF32" and the avo
// instruction stream for Pow_8x_AVX2_F32, whose Go signature is
// func(x, y []float32).
//
// The emitted routine rounds len(x) down to a multiple of 8 and, for every
// group of 8 elements, loads x[i:i+8] and y[i:i+8] and stores a result back
// into x[i:i+8]; y is only read. Judging by the name this is an element-wise
// x**y -- TODO confirm against the caller. Elements past the last full group
// of 8 are not touched, and len(y) is never loaded, so the caller presumably
// guarantees len(y) >= len(x).
//
// The LBB8_*/LCPI8_* identifiers look like they were carried over from
// compiler-generated assembly (NOTE(review): confirm); they are kept as-is.
func genPow_8x_F32() {

	data := GLOBL("genPowF32", RODATA|NOPTR)

	// 32-bit constants; every one of them is broadcast to all lanes of a YMM
	// register (VBROADCASTSS / VPBROADCASTD) before it is used.
	DATA(0, U32(2147483647))   // Label("LCPI8_0")
	DATA(4, U32(0x3f3504f3))   // Label("LCPI8_3")
	DATA(8, U32(0xbf800000))   // Label("LCPI8_4")
	DATA(12, U32(0x3def251a))  // Label("LCPI8_5")
	DATA(16, U32(0xbdebd1b8))  // Label("LCPI8_6")
	DATA(20, U32(0x3e11e9bf))  // Label("LCPI8_7")
	DATA(24, U32(0xbdfe5d4f))  // Label("LCPI8_8")
	DATA(28, U32(0x3e4cceac))  // Label("LCPI8_9")
	DATA(32, U32(0xbe2aae50))  // Label("LCPI8_10")
	DATA(36, U32(0x3eaaaaaa))  // Label("LCPI8_11")
	DATA(40, U32(0xbe7ffffc))  // Label("LCPI8_12")
	DATA(44, U32(0x3d9021bb))  // Label("LCPI8_13")
	DATA(48, U32(0xcb00007f))  // Label("LCPI8_15")
	DATA(52, U32(0x3f800000))  // Label("LCPI8_16")
	DATA(56, U32(0xbf000000))  // Label("LCPI8_17")
	DATA(60, U32(0x3f000000))  // Label("LCPI8_18")
	DATA(64, U32(0x3fb8aa3b))  // Label("LCPI8_19")
	DATA(68, U32(0xbf318000))  // Label("LCPI8_20")
	DATA(72, U32(0xb95e8083))  // Label("LCPI8_21")
	DATA(76, U32(0xbf317218))  // Label("LCPI8_22")
	DATA(80, U32(0x3d2aaaab))  // Label("LCPI8_23")
	DATA(84, U32(0x3c088889))  // Label("LCPI8_24")
	DATA(88, U32(0x3ab60b61))  // Label("LCPI8_25")
	DATA(92, U32(0x39500d01))  // Label("LCPI8_26")
	DATA(96, U32(0x3e2aaaab))  // Label("LCPI8_27")
	DATA(100, U32(254))        // Label("LCPI8_29")
	DATA(104, U32(0x43960000)) // Label("LCPI8_30")
	DATA(108, U32(1))          // Label("LCPI8_31")
	DATA(112, U32(0xc3960000)) // Label("LCPI8_32")
	DATA(116, U32(2139095040)) // Label("LCPI8_33")
	DATA(120, U32(0x7fc00102)) // Label("LCPI8_34")

	// Two 16-byte constants, loaded whole into XMM registers in the loop.
	DATA(124, U64(36028792732385279)) // Label("LCPI8_1")
	DATA(132, U64(36028792732385279))

	DATA(140, U64(4539628425446424576)) // Label("LCPI8_2")
	DATA(148, U64(4539628425446424576))

	// One 8-byte constant, broadcast with VBROADCASTSD.
	DATA(156, U64(5404319554102886400)) // Label("LCPI8_14")

	// 32-byte mask used as a VPAND operand: the first byte of every 4-byte
	// group is 0xff and the remaining three are zero.
	DATA(164, U8(255)) // Label("LCPI8_28")
	DATA(165, U8(0))
	DATA(166, U8(0))
	DATA(167, U8(0))
	DATA(168, U8(255))
	DATA(169, U8(0))
	DATA(170, U8(0))
	DATA(171, U8(0))
	DATA(172, U8(255))
	DATA(173, U8(0))
	DATA(174, U8(0))
	DATA(175, U8(0))
	DATA(176, U8(255))
	DATA(177, U8(0))
	DATA(178, U8(0))
	DATA(179, U8(0))
	DATA(180, U8(255))
	DATA(181, U8(0))
	DATA(182, U8(0))
	DATA(183, U8(0))
	DATA(184, U8(255))
	DATA(185, U8(0))
	DATA(186, U8(0))
	DATA(187, U8(0))
	DATA(188, U8(255))
	DATA(189, U8(0))
	DATA(190, U8(0))
	DATA(191, U8(0))
	DATA(192, U8(255))
	DATA(193, U8(0))
	DATA(194, U8(0))
	DATA(195, U8(0))

	TEXT("Pow_8x_AVX2_F32", NOSPLIT, "func(x, y []float32)")
	Pragma("noescape")
	// RDI = &x[0], RSI = &y[0], RDX = len(x).
	Load(Param("x").Base(), RDI)
	Load(Param("y").Base(), RSI)
	Load(Param("x").Len(), RDX)

	// Reserve scratch space by moving RSP down; the matching ADDQ is in
	// LBB8_12.
	// NOTE(review): several slots below use negative displacements
	// (-32 .. -128), i.e. memory below the adjusted RSP -- confirm nothing can
	// overwrite that region while this routine runs.
	SUBQ(I32(872), RSP)
	// RDX = len(x) rounded down to a multiple of 8; leave if that is zero.
	ANDQ(I32(-8), RDX)
	JE(LabelRef("LBB8_12"))
	// RAX is the element index of the current group of 8.
	XORL(EAX, EAX)
	// Broadcast each scalar constant once and park it in a fixed stack slot so
	// the loop can use it as a memory operand.
	VBROADCASTSS(data.Offset(0), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(320))
	VBROADCASTSS(data.Offset(4), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(800))
	// X7 = 0; it is used as the zero operand in LBB8_5 and LBB8_10.
	VPXOR(X7, X7, X7)
	VBROADCASTSS(data.Offset(8), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(768))
	VBROADCASTSS(data.Offset(12), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(736))
	VBROADCASTSS(data.Offset(16), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(704))
	VBROADCASTSS(data.Offset(20), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(672))
	VBROADCASTSS(data.Offset(24), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(640))
	VBROADCASTSS(data.Offset(28), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(608))
	VBROADCASTSS(data.Offset(32), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(576))
	VBROADCASTSS(data.Offset(36), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(544))
	VBROADCASTSS(data.Offset(40), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(512))
	VBROADCASTSS(data.Offset(44), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(480))
	VBROADCASTSD(data.Offset(156), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(448))
	VBROADCASTSS(data.Offset(48), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(416))
	VBROADCASTSS(data.Offset(52), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
	VBROADCASTSS(data.Offset(56), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(384))
	VBROADCASTSS(data.Offset(60), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(352))
	VBROADCASTSS(data.Offset(64), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(288))
	VBROADCASTSS(data.Offset(68), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(256))
	VBROADCASTSS(data.Offset(72), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(224))
	VBROADCASTSS(data.Offset(76), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(192))
	VBROADCASTSS(data.Offset(80), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(160))
	VBROADCASTSS(data.Offset(84), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(128))
	VBROADCASTSS(data.Offset(88), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(96))
	VBROADCASTSS(data.Offset(92), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
	VBROADCASTSS(data.Offset(96), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
	VBROADCASTSS(data.Offset(100), Y0)
	VMOVUPS(Y0, Mem{Base: RSP})
	VBROADCASTSS(data.Offset(104), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
	VBROADCASTSS(data.Offset(108), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
	VPBROADCASTD(data.Offset(112), Y0)
	VMOVDQU(Y0, Mem{Base: RSP}.Offset(-96))
	// Y8 and Y12 both hold the constant at offset 116 (2139095040 ==
	// 0x7f800000) in every lane and stay register-resident across iterations.
	VPBROADCASTD(data.Offset(116), Y8)
	VBROADCASTSS(data.Offset(116), Y12)
	JMP(LabelRef("LBB8_2"))

	// LBB8_10 is entered from LBB8_8 via JAE. It blends further lane
	// selections into the result, stores the 8 results to x, advances the
	// index by 8 and falls through into LBB8_2 unless the end was reached.
	Label("LBB8_10")
	{
		VPXOR(Y0, Y15, Y0)
		VPANDN(Y0, Y14, Y0)
		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y14)
		VMOVUPS(Mem{Base: RSP}.Offset(832), Y2)
		VCMPPS(Imm(0), Y2, Y14, Y3)
		VCMPPS(Imm(1), Y2, Y14, Y4)
		VXORPS(Y4, Y11, Y4)
		// Re-zero X7: LBB8_7 may have overwritten Y7.
		VPXOR(X7, X7, X7)
		VBLENDVPS(Y4, Y12, Y7, Y4)
		VBLENDVPS(Y3, Y14, Y4, Y3)
		VBLENDVPS(Y0, Y6, Y3, Y0)
		VANDPS(Y6, Y10, Y3)
		VANDPS(Y9, Y10, Y4)
		VORPS(Y2, Y4, Y4)
		VBLENDVPS(Y13, Y3, Y4, Y3)
		VBLENDVPS(Y1, Y14, Y3, Y1)
		VBLENDVPS(Y5, Y0, Y1, Y0)
		// Predicate 3 compares a register with itself, which is true only for
		// NaN lanes; such lanes take x+y from the VADDPS below.
		VCMPPS(Imm(3), Y9, Y9, Y1)
		VCMPPS(Imm(3), Y11, Y11, Y3)
		VORPS(Y1, Y3, Y1)
		VADDPS(Y9, Y11, Y3)
		VBLENDVPS(Y1, Y3, Y0, Y6)
		VMOVUPS(Y6, Mem{Base: RDI}.Idx(RAX, 4))
		ADDQ(Imm(8), RAX)
		CMPQ(RAX, RDX)
		JAE(LabelRef("LBB8_12"))
	}

	// LBB8_2 is the main loop body: Y9 = x[RAX:RAX+8], Y11 = y[RAX:RAX+8].
	Label("LBB8_2")
	{
		// Y12 is reused as a temporary below, so its loop-invariant value is
		// kept in Y2 and copied back before LBB8_5 runs.
		VMOVAPS(Y12, Y2)
		VMOVDQU(Mem{Base: RDI}.Idx(RAX, 4), Y9)
		VMOVUPS(Mem{Base: RSI}.Idx(RAX, 4), Y11)
		VPAND(Mem{Base: RSP}.Offset(320), Y9, Y12)
		// The two 128-bit halves of x are masked and OR-ed separately and then
		// recombined into Y0.
		VMOVUPS(data.Offset(124), X1)
		VANDPS(Mem{Base: RDI}.Idx(RAX, 4), X1, X0)
		VMOVUPS(data.Offset(140), X3)
		VORPS(X3, X0, X0)
		VANDPS(Mem{Base: RDI}.Idx(RAX, 4).Offset(16), X1, X1)
		VORPS(X3, X1, X1)
		VINSERTF128(Imm(1), X1, Y0, Y0)
		VMOVUPS(Mem{Base: RSP}.Offset(800), Y1)
		VCMPPS(Imm(1), Y0, Y1, Y1)
		VANDNPS(Y0, Y1, Y4)
		VADDPS(Mem{Base: RSP}.Offset(768), Y0, Y0)
		VADDPS(Y4, Y0, Y4)
		VMULPS(Y4, Y4, Y6)
		VMULPS(Y6, Y6, Y0)
		VMOVUPS(Mem{Base: RSP}.Offset(704), Y5)
		VFMADD213PS(Mem{Base: RSP}.Offset(736), Y4, Y5)
		VMOVUPS(Mem{Base: RSP}.Offset(640), Y10)
		VFMADD213PS(Mem{Base: RSP}.Offset(672), Y4, Y10)
		VFMADD231PS(Y5, Y6, Y10)
		VMOVUPS(Mem{Base: RSP}.Offset(576), Y5)
		VFMADD213PS(Mem{Base: RSP}.Offset(608), Y4, Y5)
		VMOVUPS(Mem{Base: RSP}.Offset(512), Y13)
		VFMADD213PS(Mem{Base: RSP}.Offset(544), Y4, Y13)
		VMULPS(Y0, Y0, Y14)
		VFMADD132PS(Mem{Base: RSP}.Offset(480), Y13, Y14)
		VFMADD231PS(Y5, Y6, Y14)
		VFMADD231PS(Y10, Y0, Y14)
		VMULPS(Y4, Y6, Y0)
		VMULPS(Y0, Y14, Y0)
		// Spill the masked x value; LBB8_10 reloads it from this slot.
		VMOVDQU(Y12, Mem{Base: RSP}.Offset(832))
		VPSRLD(Imm(23), Y12, Y5)
		VPOR(Mem{Base: RSP}.Offset(448), Y5, Y5)
		VADDPS(Mem{Base: RSP}.Offset(416), Y5, Y5)
		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y3)
		VANDPS(Y3, Y1, Y1)
		VADDPS(Y1, Y5, Y5)
		VMULPS(Y5, Y11, Y1)
		VROUNDPS(Imm(8), Y1, Y1)
		VFNMADD213PS(Y1, Y11, Y5)
		VMOVUPS(Mem{Base: RSP}.Offset(384), Y14)
		VMOVAPS(Y14, Y10)
		VFMADD213PS(Y4, Y6, Y10)
		VADDPS(Y0, Y10, Y10)
		VMOVUPS(Mem{Base: RSP}.Offset(352), Y12)
		VMULPS(Y4, Y12, Y13)
		VMULPS(Y6, Y14, Y14)
		VFMADD231PS(Y13, Y4, Y14)
		VSUBPS(Y4, Y10, Y4)
		VFMADD231PS(Y6, Y12, Y4)
		VMOVUPS(Mem{Base: RSP}.Offset(288), Y15)
		VMULPS(Y15, Y11, Y6)
		VMULPS(Y6, Y10, Y6)
		VROUNDPS(Imm(8), Y6, Y6)
		VMULPS(Mem{Base: RSP}.Offset(256), Y6, Y13)
		VFMADD231PS(Y10, Y11, Y13)
		VFNMADD231PS(Mem{Base: RSP}.Offset(224), Y6, Y13)
		VSUBPS(Y0, Y14, Y0)
		VADDPS(Y4, Y0, Y0)
		VMOVUPS(Mem{Base: RSP}.Offset(192), Y10)
		VMULPS(Y5, Y10, Y4)
		VFNMADD231PS(Y0, Y11, Y4)
		VADDPS(Y4, Y13, Y0)
		VMULPS(Y0, Y15, Y4)
		VROUNDPS(Imm(8), Y4, Y4)
		VFMADD231PS(Y10, Y4, Y0)
		VMULPS(Y0, Y0, Y5)
		VMULPS(Y5, Y5, Y10)
		VMOVUPS(Mem{Base: RSP}.Offset(64), Y13)
		VFMADD213PS(Mem{Base: RSP}.Offset(96), Y0, Y13)
		VMOVUPS(Mem{Base: RSP}.Offset(32), Y14)
		VFMADD213PS(Y12, Y0, Y14)
		VFMADD231PS(Y13, Y10, Y14)
		VMOVUPS(Mem{Base: RSP}.Offset(128), Y10)
		VFMADD213PS(Mem{Base: RSP}.Offset(160), Y0, Y10)
		VFMADD231PS(Y10, Y5, Y14)
		VADDPS(Y3, Y0, Y10)
		VFMADD231PS(Y14, Y5, Y10)
		VADDPS(Y1, Y6, Y0)
		VADDPS(Y4, Y0, Y14)
		VCVTPS2DQ(Y14, Y4)
		VPSRLD(Imm(23), Y10, Y0)
		VPAND(data.Offset(164), Y0, Y0)
		VPADDD(Y4, Y0, Y0)
		// Y1 and Y0 become two per-lane masks built from integer and float
		// range comparisons; Y4 = Y1 | Y0 is tested below.
		VPCMPGTD(Mem{Base: RSP}, Y0, Y1)
		VMOVUPS(Mem{Base: RSP}.Offset(-32), Y3)
		VCMPPS(Imm(1), Y14, Y3, Y5)
		VPOR(Y5, Y1, Y1)
		VMOVDQU(Mem{Base: RSP}.Offset(-64), Y3)
		VPCMPGTD(Y0, Y3, Y0)
		VCMPPS(Imm(1), Mem{Base: RSP}.Offset(-96), Y14, Y5)
		VPOR(Y5, Y0, Y0)
		VPSLLD(Imm(23), Y4, Y4)
		VPADDD(Y4, Y10, Y6)
		VPOR(Y1, Y0, Y4)
		// Branch to the fix-up in LBB8_3 when any lane of Y4 has its sign bit
		// set; otherwise skip it.
		VTESTPS(Y4, Y4)
		JNE(LabelRef("LBB8_3"))
		// Y15 = all ones, Y12 = its loop-invariant value again.
		VPCMPEQD(Y15, Y15, Y15)
		VMOVAPS(Y2, Y12)
		JMP(LabelRef("LBB8_5"))
	}

	// LBB8_3 patches the lanes flagged by the two masks computed above, then
	// falls through into LBB8_5 with Y15 and Y12 set up as on the other path.
	Label("LBB8_3")
	{
		VPANDN(Y6, Y0, Y0)
		VMOVAPS(Y2, Y12)
		VBLENDVPS(Y1, Y2, Y0, Y6)
		VPCMPEQD(Y15, Y15, Y15)
	}

	// LBB8_5 applies selections that depend on x and y compared against zero
	// (Y7), then decides whether the extra work in LBB8_7 is needed.
	Label("LBB8_5")
	{
		VPAND(Y8, Y9, Y5)
		VPCMPEQD(Y7, Y5, Y4)
		VCMPPS(Imm(1), Y7, Y11, Y13)
		VCMPPS(Imm(0), Y7, Y11, Y1)
		VANDPS(Mem{Base: RSP}.Offset(-128), Y1, Y0)
		VBLENDVPS(Y13, Y12, Y0, Y0)
		VBLENDVPS(Y4, Y0, Y6, Y6)
		// ECX collects the sign bits of x; LBB8_7 runs only if at least one
		// lane of x has its sign bit set.
		VMOVMSKPS(Y9, ECX)
		TESTL(ECX, ECX)
		JNE(LabelRef("LBB8_7"))
		VXORPS(X10, X10, X10)
		JMP(LabelRef("LBB8_8"))
	}

	// LBB8_7 handles groups containing an x lane with the sign bit set. It
	// overwrites Y3, Y7, Y10 and Y12 (Y12 is restored from Y2 inside).
	Label("LBB8_7")
	{
		VROUNDPS(Imm(8), Y11, Y0)
		VCMPPS(Imm(0), Y0, Y11, Y0)
		VCVTPS2DQ(Y11, Y10)
		VPSLLD(Imm(31), Y10, Y10)
		VPOR(Y6, Y10, Y12)
		VPXOR(X3, X3, X3)
		VCMPPS(Imm(0), Y3, Y9, Y7)
		VBROADCASTSS(data.Offset(120), Y3)
		VBLENDVPS(Y7, Y6, Y3, Y3)
		VBLENDVPS(Y0, Y12, Y3, Y3)
		VMOVAPS(Y2, Y12)
		VPSRAD(Imm(31), Y9, Y7)
		VBLENDVPS(Y7, Y3, Y6, Y6)
		VANDPS(Y0, Y10, Y10)
	}

	// LBB8_8 builds a combined mask in Y3 and tests it against all-ones (Y15).
	// JAE (carry clear) diverts to LBB8_10; otherwise the result in Y6 is
	// stored directly and the loop continues while RAX < RDX.
	Label("LBB8_8")
	{
		VPCMPEQD(Y5, Y8, Y0)
		VPXOR(Y0, Y15, Y5)
		VANDPS(Y8, Y11, Y0)
		VANDPS(Y8, Y14, Y3)
		VPCMPEQD(Y3, Y8, Y14)
		VPXOR(Y15, Y14, Y3)
		VPCMPEQD(Y0, Y8, Y0)
		VPANDN(Y5, Y0, Y7)
		VPOR(Y4, Y3, Y3)
		VPAND(Y7, Y3, Y3)
		VTESTPS(Y15, Y3)
		JAE(LabelRef("LBB8_10"))
		// Y7 was used as a temporary above; zero it again for the next pass.
		VPXOR(X7, X7, X7)
		VMOVUPS(Y6, Mem{Base: RDI}.Idx(RAX, 4))
		ADDQ(Imm(8), RAX)
		CMPQ(RAX, RDX)
		JB(LabelRef("LBB8_2"))
	}

	// LBB8_12: undo the stack adjustment made in the prologue and return.
	Label("LBB8_12")
	{
		ADDQ(I32(872), RSP)
		VZEROUPPER()
		RET()
	}
}

// genLog10_Len8x_F32 registers the constant table "dataLog10Len8xF32" and
// emits Log10_Len8x_AVX2_F32 (Go signature func(x []float32)).
//
// The emitted loop rewrites x in place, 8 floats per iteration, and keeps
// going while the element index is below len(x). A length that is not a
// multiple of 8 would therefore make the last iteration touch memory past the
// slice; the "Len8x" suffix suggests callers guarantee a multiple of 8
// (TODO confirm). The per-lane result is scaled by the constant at table
// offset 72 (0x3ede5bd9, about 0.4343 as a float32) before being stored, and
// lanes whose input compares <= 0 end up with all bits set.
func genLog10_Len8x_F32() {
	tbl := GLOBL("dataLog10Len8xF32", RODATA|NOPTR)

	// 32-bit constants, laid out back to back starting at byte offset 0.
	words := [...]uint32{
		0x00800000, // +0
		2155872255, // +4
		1056964608, // +8
		4294967169, // +12
		0x3f800000, // +16
		0x3f3504f3, // +20
		0xbf800000, // +24
		0x3d9021bb, // +28
		0xbdebd1b8, // +32
		0x3def251a, // +36
		0xbdfe5d4f, // +40
		0x3e11e9bf, // +44
		0xbe2aae50, // +48
		0x3e4cceac, // +52
		0xbe7ffffc, // +56
		0x3eaaaaaa, // +60
		0x3f317218, // +64
		0xbf000000, // +68
		0x3ede5bd9, // +72
	}
	for i, w := range words {
		DATA(4*i, U32(w))
	}
	// 32 zero bytes at +76, used as a full-width comparison operand.
	for off := 76; off <= 100; off += 8 {
		DATA(off, U64(0x0))
	}

	TEXT("Log10_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// slot addresses scratch memory relative to RSP; group is the current
	// run of 8 floats inside x.
	slot := func(disp int) Mem { return Mem{Base: RSP}.Offset(disp) }
	group := Mem{Base: RDI}.Idx(RAX, 4)

	SUBQ(Imm(136), RSP)
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB8_3"))
	XORL(EAX, EAX)

	// Constants that do not fit in registers are broadcast through Y0 and
	// parked on the stack: table offset -> stack displacement.
	spills := []struct{ from, to int }{
		{4, 96},
		{8, 64},
		{12, 32},
		{0, 0},
		{16, -32},
		{20, -64},
		{24, -96},
		{28, -128},
	}
	for _, s := range spills {
		VBROADCASTSS(tbl.Offset(s.from), Y0)
		VMOVUPS(Y0, slot(s.to))
	}

	// The remaining constants (+32 .. +72) stay in registers for the whole loop.
	resident := []Op{Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y0, Y1, Y2, Y3}
	for i, r := range resident {
		VBROADCASTSS(tbl.Offset(32+4*i), r)
	}

	Label("LBB8_2")
	VMOVUPS(group, Y4)
	VMAXPS(slot(0), Y4, Y5)
	VPSRLD(Imm(23), Y5, Y6)
	VPADDD(slot(32), Y6, Y6)
	VANDPS(slot(96), Y5, Y5)
	VORPS(slot(64), Y5, Y5)
	VCVTDQ2PS(Y6, Y6)
	VADDPS(slot(-32), Y6, Y7)
	VCMPPS(Imm(1), slot(-64), Y5, Y8)
	VBLENDVPS(Y8, Y6, Y7, Y6)
	VANDPS(Y5, Y8, Y7)
	VADDPS(slot(-96), Y5, Y5)
	VADDPS(Y7, Y5, Y5)
	// Chain of fused multiply-adds accumulating into Y7, one per coefficient.
	VMOVUPS(slot(-128), Y7)
	for _, coeff := range []Op{Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y0, Y2} {
		VFMADD213PS(coeff, Y5, Y7)
	}
	VFMADD213PS(Y5, Y1, Y6)
	VMULPS(Y5, Y5, Y5)
	VFMADD231PS(Y7, Y5, Y6)
	// Mask of lanes whose original input is <= 0; OR-ing it in sets all bits.
	VCMPPS(Imm(2), tbl.Offset(76), Y4, Y4)
	VMULPS(Y3, Y6, Y5)
	VORPS(Y5, Y4, Y4)
	VMOVUPS(Y4, group)
	ADDQ(Imm(8), RAX)
	CMPQ(RAX, RSI)
	JB(LabelRef("LBB8_2"))

	Label("LBB8_3")
	ADDQ(Imm(136), RSP)
	VZEROUPPER()
	RET()
}

// genLog2_Len8x_F32 registers the constant table "dataLog2Len8xF32" and emits
// Log2_Len8x_AVX2_F32 (Go signature func(x []float32)).
//
// The emitted loop rewrites x in place, 8 floats per iteration, and keeps
// going while the element index is below len(x). A length that is not a
// multiple of 8 would therefore make the last iteration touch memory past the
// slice; the "Len8x" suffix suggests callers guarantee a multiple of 8
// (TODO confirm). The per-lane result is scaled by the constant at table
// offset 72 (0x3fb8aa3b, about 1.4427 as a float32) before being stored, and
// lanes whose input compares <= 0 end up with all bits set.
func genLog2_Len8x_F32() {
	tbl := GLOBL("dataLog2Len8xF32", RODATA|NOPTR)

	// 32-bit constants, laid out back to back starting at byte offset 0.
	words := [...]uint32{
		0x00800000, // +0
		2155872255, // +4
		1056964608, // +8
		4294967169, // +12
		0x3f800000, // +16
		0x3f3504f3, // +20
		0xbf800000, // +24
		0x3d9021bb, // +28
		0xbdebd1b8, // +32
		0x3def251a, // +36
		0xbdfe5d4f, // +40
		0x3e11e9bf, // +44
		0xbe2aae50, // +48
		0x3e4cceac, // +52
		0xbe7ffffc, // +56
		0x3eaaaaaa, // +60
		0x3f317218, // +64
		0xbf000000, // +68
		0x3fb8aa3b, // +72
	}
	for i, w := range words {
		DATA(4*i, U32(w))
	}
	// 32 zero bytes at +76, used as a full-width comparison operand.
	for off := 76; off <= 100; off += 8 {
		DATA(off, U64(0x0))
	}

	TEXT("Log2_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// slot addresses scratch memory relative to RSP; group is the current
	// run of 8 floats inside x.
	slot := func(disp int) Mem { return Mem{Base: RSP}.Offset(disp) }
	group := Mem{Base: RDI}.Idx(RAX, 4)

	SUBQ(Imm(136), RSP)
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB9_3"))
	XORL(EAX, EAX)

	// Constants that do not fit in registers are broadcast through Y0 and
	// parked on the stack: table offset -> stack displacement.
	spills := []struct{ from, to int }{
		{4, 96},
		{8, 64},
		{12, 32},
		{0, 0},
		{16, -32},
		{20, -64},
		{24, -96},
		{28, -128},
	}
	for _, s := range spills {
		VBROADCASTSS(tbl.Offset(s.from), Y0)
		VMOVUPS(Y0, slot(s.to))
	}

	// The remaining constants (+32 .. +72) stay in registers for the whole loop.
	resident := []Op{Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y0, Y1, Y2, Y3}
	for i, r := range resident {
		VBROADCASTSS(tbl.Offset(32+4*i), r)
	}

	Label("LBB9_2")
	VMOVUPS(group, Y4)
	VMAXPS(slot(0), Y4, Y5)
	VPSRLD(Imm(23), Y5, Y6)
	VPADDD(slot(32), Y6, Y6)
	VANDPS(slot(96), Y5, Y5)
	VORPS(slot(64), Y5, Y5)
	VCVTDQ2PS(Y6, Y6)
	VADDPS(slot(-32), Y6, Y7)
	VCMPPS(Imm(1), slot(-64), Y5, Y8)
	VBLENDVPS(Y8, Y6, Y7, Y6)
	VANDPS(Y5, Y8, Y7)
	VADDPS(slot(-96), Y5, Y5)
	VADDPS(Y7, Y5, Y5)
	// Chain of fused multiply-adds accumulating into Y7, one per coefficient.
	VMOVUPS(slot(-128), Y7)
	for _, coeff := range []Op{Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y0, Y2} {
		VFMADD213PS(coeff, Y5, Y7)
	}
	VFMADD213PS(Y5, Y1, Y6)
	VMULPS(Y5, Y5, Y5)
	VFMADD231PS(Y7, Y5, Y6)
	// Mask of lanes whose original input is <= 0; OR-ing it in sets all bits.
	VCMPPS(Imm(2), tbl.Offset(76), Y4, Y4)
	VMULPS(Y3, Y6, Y5)
	VORPS(Y5, Y4, Y4)
	VMOVUPS(Y4, group)
	ADDQ(Imm(8), RAX)
	CMPQ(RAX, RSI)
	JB(LabelRef("LBB9_2"))

	Label("LBB9_3")
	ADDQ(Imm(136), RSP)
	VZEROUPPER()
	RET()
}

// genLog_Len8x_F32 registers the constant table "dataLogLen8xF32" and emits
// Log_Len8x_AVX2_F32 (Go signature func(x []float32)).
//
// The emitted loop rewrites x in place, 8 floats per iteration, and keeps
// going while the element index is below len(x). A length that is not a
// multiple of 8 would therefore make the last iteration touch memory past the
// slice; the "Len8x" suffix suggests callers guarantee a multiple of 8
// (TODO confirm). Lanes whose input compares <= 0 end up with all bits set.
func genLog_Len8x_F32() {
	tbl := GLOBL("dataLogLen8xF32", RODATA|NOPTR)

	// 32-bit constants, laid out back to back starting at byte offset 0.
	words := [...]uint32{
		0x00800000, // +0
		2155872255, // +4
		1056964608, // +8
		4294967169, // +12
		0x3f800000, // +16
		0x3f3504f3, // +20
		0xbf800000, // +24
		0x3d9021bb, // +28
		0xbdebd1b8, // +32
		0x3def251a, // +36
		0xbdfe5d4f, // +40
		0x3e11e9bf, // +44
		0xbe2aae50, // +48
		0x3e4cceac, // +52
		0xbe7ffffc, // +56
		0x3eaaaaaa, // +60
		0x3f317218, // +64
		0xbf000000, // +68
	}
	for i, w := range words {
		DATA(4*i, U32(w))
	}
	// 32 zero bytes at +72, used as a full-width comparison operand.
	for off := 72; off <= 96; off += 8 {
		DATA(off, U64(0x0))
	}

	TEXT("Log_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// slot addresses scratch memory relative to RSP; group is the current
	// run of 8 floats inside x.
	slot := func(disp int) Mem { return Mem{Base: RSP}.Offset(disp) }
	group := Mem{Base: RDI}.Idx(RAX, 4)

	SUBQ(Imm(104), RSP)
	TESTQ(RSI, RSI)
	JE(LabelRef("LBB10_3"))
	XORL(EAX, EAX)

	// The first seven constants (+0 .. +24) are broadcast through Y0 and
	// parked in consecutive 32-byte stack slots from +64 down to -128.
	for i := 0; i < 7; i++ {
		VBROADCASTSS(tbl.Offset(4*i), Y0)
		VMOVUPS(Y0, slot(64-32*i))
	}

	// The remaining constants (+28 .. +68) stay in registers for the whole loop.
	resident := []Op{Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y0, Y1, Y2}
	for i, r := range resident {
		VBROADCASTSS(tbl.Offset(28+4*i), r)
	}

	Label("LBB10_2")
	VMOVUPS(group, Y3)
	VMAXPS(slot(64), Y3, Y4)
	VPSRLD(Imm(23), Y4, Y5)
	VPADDD(slot(-32), Y5, Y5)
	VANDPS(slot(32), Y4, Y4)
	VORPS(slot(0), Y4, Y4)
	VCVTDQ2PS(Y5, Y5)
	VADDPS(slot(-64), Y5, Y6)
	VCMPPS(Imm(1), slot(-96), Y4, Y7)
	VBLENDVPS(Y7, Y5, Y6, Y5)
	VANDPS(Y4, Y7, Y6)
	VADDPS(slot(-128), Y4, Y4)
	VADDPS(Y6, Y4, Y4)
	// Chain of fused multiply-adds accumulating into Y6, one per coefficient.
	VMOVAPS(Y8, Y6)
	for _, coeff := range []Op{Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y0, Y2} {
		VFMADD213PS(coeff, Y4, Y6)
	}
	VFMADD213PS(Y4, Y1, Y5)
	VMULPS(Y4, Y4, Y4)
	VFMADD231PS(Y6, Y4, Y5)
	// Mask of lanes whose original input is <= 0; OR-ing it in sets all bits.
	VCMPPS(Imm(2), tbl.Offset(72), Y3, Y3)
	VORPS(Y5, Y3, Y3)
	VMOVUPS(Y3, group)
	ADDQ(Imm(8), RAX)
	CMPQ(RAX, RSI)
	JB(LabelRef("LBB10_2"))

	Label("LBB10_3")
	ADDQ(Imm(104), RSP)
	VZEROUPPER()
	RET()
}

// genExp_Len8x_F32 registers the constant table "dataExpLen8xF32" and emits
// Exp_Len8x_AVX2_F32 (Go signature func(x []float32)).
//
// The emitted loop rewrites x in place, 8 floats per iteration, and keeps
// going while the element index is below len(x). A length that is not a
// multiple of 8 would therefore make the last iteration touch memory past the
// slice; the "Len8x" suffix suggests callers guarantee a multiple of 8
// (TODO confirm). After the main computation, lanes whose input is greater
// than the constant at +0 are replaced by the constant at +48 (0x7f7fffff),
// and lanes for which "constant at +4 <= input" does not hold are cleared.
//
// NOTE(review): RSP is never adjusted here; the two spilled constants live at
// negative displacements from RSP -- confirm that region is safe to use.
func genExp_Len8x_F32() {
	tbl := GLOBL("dataExpLen8xF32", RODATA|NOPTR)

	// 32-bit constants, laid out back to back starting at byte offset 0.
	words := [...]uint32{
		0x42b17218, // +0
		0xc2ce8ed0, // +4
		0x3f000000, // +8
		0x3fb8aa3b, // +12
		0xbf318000, // +16
		0x395e8083, // +20
		1065353216, // +24
		0x3ab743ce, // +28
		0x39506967, // +32
		0x3c088908, // +36
		0x3d2aa9c1, // +40
		0x3e2aaaaa, // +44
		0x7f7fffff, // +48
	}
	for i, w := range words {
		DATA(4*i, U32(w))
	}

	TEXT("Exp_Len8x_AVX2_F32", NOSPLIT, "func(x []float32)")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// Memory operands used repeatedly below.
	group := Mem{Base: RDI}.Idx(RAX, 4)   // current run of 8 floats in x
	slotHi := Mem{Base: RSP}.Offset(-40)  // broadcast of the constant at +0
	slotLo := Mem{Base: RSP}.Offset(-72)  // broadcast of the constant at +4
	bcast := func(off int, dst Op) { VBROADCASTSS(tbl.Offset(off), dst) }

	TESTQ(RSI, RSI)
	JE(LabelRef("LBB11_3"))
	XORL(EAX, EAX)

	// Two constants are spilled; the rest stay in registers.
	bcast(0, Y0)
	VMOVUPS(Y0, slotHi)
	bcast(4, Y0)
	VMOVUPS(Y0, slotLo)
	bcast(8, Y2)
	bcast(12, Y3)
	bcast(16, Y4)
	bcast(20, Y5)
	VPBROADCASTD(tbl.Offset(24), Y6)
	bcast(28, Y7)
	bcast(32, Y1)
	bcast(36, Y9)
	bcast(40, Y10)
	bcast(44, Y11)
	bcast(48, Y12)

	Label("LBB11_2")
	VMOVUPS(group, Y13)
	VMOVAPS(Y3, Y14)
	VFMADD213PS(Y2, Y13, Y14)
	VROUNDPS(Imm(1), Y14, Y14)
	VMOVAPS(Y4, Y15)
	VFMADD213PS(Y13, Y14, Y15)
	VFMADD231PS(Y5, Y14, Y15)
	VMULPS(Y15, Y15, Y0)
	// Chain of fused multiply-adds accumulating into Y8, one per coefficient.
	VMOVAPS(Y1, Y8)
	for _, coeff := range []Op{Y7, Y9, Y10, Y11, Y2} {
		VFMADD213PS(coeff, Y15, Y8)
	}
	VFMADD213PS(Y15, Y0, Y8)
	// Convert the rounded value to an integer, shift it into bit position 23
	// and add the constant from +24.
	VCVTTPS2DQ(Y14, Y0)
	VPSLLD(Imm(23), Y0, Y0)
	VPADDD(Y6, Y0, Y0)
	VFMADD213PS(Y0, Y0, Y8)
	// Upper clamp, then lower cut-off, both keyed on the original input Y13.
	VMOVUPS(slotHi, Y0)
	VCMPPS(Imm(1), Y13, Y0, Y0)
	VBLENDVPS(Y0, Y12, Y8, Y0)
	VMOVUPS(slotLo, Y8)
	VCMPPS(Imm(2), Y13, Y8, Y8)
	VANDPS(Y0, Y8, Y0)
	VMOVUPS(Y0, group)
	ADDQ(Imm(8), RAX)
	CMPQ(RAX, RSI)
	JB(LabelRef("LBB11_2"))

	Label("LBB11_3")
	VZEROUPPER()
	RET()
}

// genSin_F32 emits the read-only constant table "dataSinF32" and the avo
// instruction stream for Sin_AVX2_F32 (Go signature func(x []float32)), which
// overwrites x in place -- presumably with sin(x[i]); TODO confirm against
// the caller.
//
// The emitted routine has two phases:
//   - a vector loop (LBB12_2) over the first len(x)&^7 elements, 8 floats per
//     iteration;
//   - a scalar loop (LBB12_5 .. LBB12_13) over the remaining elements, one
//     float per iteration.
//
// In the scalar loop an element whose magnitude compares above the constant
// at offset 56 (0x4b7fffff) is skipped and left unmodified.
func genSin_F32() {

	data := GLOBL("dataSinF32", RODATA|NOPTR)
	// 32-bit constants; broadcast for the vector loop and loaded as scalars
	// for the remainder loop.
	DATA(0, U32(2147483647))
	DATA(4, U32(0x3fa2f983))
	DATA(8, U32(4294967294))
	DATA(12, U32(2))
	DATA(16, U32(0xbf490fdb))
	DATA(20, U32(2147483648))
	DATA(24, U32(0x37ccf5ce))
	DATA(28, U32(0xbab6061a))
	DATA(32, U32(0x3d2aaaa5))
	DATA(36, U32(0xbf000000))
	DATA(40, U32(0x3f800000))
	DATA(44, U32(0xb94ca1f9))
	DATA(48, U32(0x3c08839e))
	DATA(52, U32(0xbe2aaaa3))
	DATA(56, U32(0x4b7fffff))
	// 32 bytes of all-ones at offset 60 (operand of VPSUBD in the vector loop).
	DATA(60, U64(0xffffffffffffffff))
	DATA(68, U64(0xffffffffffffffff))
	DATA(76, U64(0xffffffffffffffff))
	DATA(84, U64(0xffffffffffffffff))
	// 32 bytes of zero at offset 92 (operand of VPCMPEQD in the vector loop).
	DATA(92, U64(0x0))
	DATA(100, U64(0x0))
	DATA(108, U64(0x0))
	DATA(116, U64(0x0))

	// NOTE(review): the attribute argument is 0 here, whereas genCos_F32
	// passes NOSPLIT -- confirm the difference is intended.
	TEXT("Sin_AVX2_F32", 0, "func(x []float32)")
	Pragma("noescape")
	// RDI = &x[0], RSI = len(x).
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// PUSHQ here is undone by the POPQ in LBB12_14.
	// NOTE(review): the spill slots below are all at negative displacements
	// from RSP -- confirm that region is safe to use.
	PUSHQ(RAX)
	// RAX = len(x) rounded down to a multiple of 8 (vector-loop bound); skip
	// the vector loop entirely when that is zero.
	MOVQ(RSI, RAX)
	ANDQ(I32(-8), RAX)
	JE(LabelRef("LBB12_3"))
	// RCX is the element index inside the vector loop.
	XORL(ECX, ECX)
	// Broadcast the constants: four go to stack slots, the rest stay in
	// registers for the duration of the vector loop.
	VBROADCASTSS(data.Offset(0), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
	VBROADCASTSS(data.Offset(4), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
	VBROADCASTSS(data.Offset(8), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
	VPBROADCASTD(data.Offset(12), Y4)
	VPBROADCASTD(data.Offset(16), Y0)
	VMOVDQU(Y0, Mem{Base: RSP}.Offset(-128))
	VPBROADCASTD(data.Offset(20), Y7)
	VBROADCASTSS(data.Offset(24), Y8)
	VBROADCASTSS(data.Offset(28), Y9)
	VBROADCASTSS(data.Offset(32), Y10)
	VBROADCASTSS(data.Offset(36), Y11)
	VBROADCASTSS(data.Offset(40), Y12)
	VBROADCASTSS(data.Offset(44), Y3)
	VBROADCASTSS(data.Offset(48), Y14)
	VBROADCASTSS(data.Offset(52), Y15)

	// Vector loop: 8 floats per iteration while RCX < RAX.
	Label("LBB12_2")
	{
		VMOVUPS(Mem{Base: RDI}.Idx(RCX, 4), Y2)
		// Y5 = x with the top bit of every lane cleared (mask 0x7fffffff).
		VANDPS(Mem{Base: RSP}.Offset(-32), Y2, Y5)
		VMULPS(Mem{Base: RSP}.Offset(-64), Y5, Y0)
		VCVTTPS2DQ(Y0, Y0)
		// Subtracting the all-ones table adds 1 to every integer lane; the
		// following AND with 0xfffffffe then clears bit 0.
		VPSUBD(data.Offset(60), Y0, Y0)
		VPAND(Mem{Base: RSP}.Offset(-96), Y0, Y1)
		VCVTDQ2PS(Y1, Y1)
		VFMADD132PS(Mem{Base: RSP}.Offset(-128), Y5, Y1)
		VMULPS(Y1, Y1, Y5)
		// First coefficient chain (constants at offsets 44, 48, 52) -> Y6.
		VMOVAPS(Y3, Y13)
		VFMADD213PS(Y14, Y5, Y13)
		VFMADD213PS(Y15, Y5, Y13)
		VMULPS(Y1, Y5, Y6)
		VFMADD213PS(Y1, Y13, Y6)
		VPSLLD(Imm(29), Y0, Y1)
		VPAND(Y4, Y0, Y0)
		VPXOR(Y2, Y1, Y1)
		// Second coefficient chain (constants at offsets 24 .. 40) -> Y2.
		VMOVAPS(Y8, Y2)
		VFMADD213PS(Y9, Y5, Y2)
		VFMADD213PS(Y10, Y5, Y2)
		VFMADD213PS(Y11, Y5, Y2)
		VFMADD213PS(Y12, Y5, Y2)
		// Per lane, keep Y2 where (Y0 & 2) == 2 and Y6 where (Y0 & 2) == 0,
		// then add the two masked values.
		VPCMPEQD(Y4, Y0, Y5)
		VANDPS(Y5, Y2, Y2)
		VPCMPEQD(data.Offset(92), Y0, Y0)
		VANDPS(Y0, Y6, Y0)
		VADDPS(Y2, Y0, Y0)
		// Apply the computed sign bit (Y7 is the 0x80000000 mask).
		VPAND(Y7, Y1, Y1)
		VPXOR(Y0, Y1, Y0)
		VMOVDQU(Y0, Mem{Base: RDI}.Idx(RCX, 4))
		ADDQ(Imm(8), RCX)
		CMPQ(RCX, RAX)
		JB(LabelRef("LBB12_2"))
	}

	// Scalar remainder setup: nothing to do when RAX (the rounded-down
	// length) already reaches len(x). Otherwise load the constants as scalars.
	Label("LBB12_3")
	{
		CMPQ(RAX, RSI)
		JAE(LabelRef("LBB12_14"))
		// X0 = 0x80000000 mask, X1 = 0, X2 = constant at offset 56.
		VBROADCASTSS(data.Offset(20), X0)
		VPXOR(X1, X1, X1)
		VMOVSS(data.Offset(56), X2)
		VMOVSS(data.Offset(40), X9)
		VMOVSS(data.Offset(16), X10)
		VMOVSS(data.Offset(24), X12)
		VMOVSS(data.Offset(28), X11)
		VMOVSS(data.Offset(32), X13)
		VMOVSS(data.Offset(36), X14)
		VMOVSS(data.Offset(44), X8)
		VMOVSS(data.Offset(48), X15)
		VMOVSS(data.Offset(52), X6)
		JMP(LabelRef("LBB12_5"))
	}

	// Advance to the next scalar element; leave once RAX reaches len(x).
	Label("LBB12_13")
	{
		ADDQ(Imm(1), RAX)
		CMPQ(RAX, RSI)
		JAE(LabelRef("LBB12_14"))
	}

	// Scalar loop head: X4 = x[RAX].
	Label("LBB12_5")
	{
		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X4)
		// X3 = x with its sign bit flipped when x < 0, otherwise x unchanged.
		VXORPS(X0, X4, X3)
		VCMPSS(Imm(1), X1, X4, X5)
		VBLENDVPS(X5, X3, X4, X3)
		// Skip this element (leaving it untouched) when X3 compares above X2.
		VUCOMISS(X2, X3)
		JA(LabelRef("LBB12_13"))
		// R8B records the carry flag from comparing the original x with 0.
		VUCOMISS(X1, X4)
		SETCS(R8B)
		VMULSS(data.Offset(4), X3, X4)
		VCVTTSS2SI(X4, EDX)
		VROUNDSS(Imm(11), X4, X4, X4)
		// If the truncated integer is odd, add the constant in X9 to X4.
		MOVL(EDX, ECX)
		ANDL(Imm(1), ECX)
		JE(LabelRef("LBB12_8"))
		VADDSS(X4, X9, X4)
	}

	Label("LBB12_8")
	{
		// ECX = (integer + its low bit) & 7; EDX = ECX-4, replaced by ECX
		// when the CMOVLLT condition holds. R9B records the SETCC result of
		// the compare against 4.
		ADDL(EDX, ECX)
		ANDL(Imm(7), ECX)
		LEAL(Mem{Base: RCX}.Offset(-4), EDX)
		CMPL(ECX, Imm(4))
		SETCC(R9B)
		CMOVLLT(ECX, EDX)
		VFMADD231SS(X10, X4, X3)
		VMULSS(X3, X3, X4)
		// Both coefficient chains are evaluated up front: X7 from X12, X11,
		// X13, X14 and X5 from X8, X15, X6.
		VMOVAPS(X12, X7)
		VFMADD213SS(X11, X4, X7)
		VFMADD213SS(X13, X4, X7)
		VFMADD213SS(X14, X4, X7)
		VMOVAPS(X8, X5)
		VFMADD213SS(X15, X4, X5)
		VFMADD213SS(X6, X4, X5)
		// EDX-1 below 2 (unsigned) selects the X7-based result in LBB12_9;
		// otherwise the X5-based result is finished and stored here.
		ADDL(I32(-1), EDX)
		CMPL(EDX, Imm(2))
		JB(LabelRef("LBB12_9"))
		VMULSS(X3, X4, X4)
		VFMADD213SS(X3, X4, X5)
		VMOVAPS(X5, X4)
		VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4))
		// When the two recorded flags differ, LBB12_12 rewrites the element
		// with its sign flipped.
		CMPB(R8B, R9B)
		JE(LabelRef("LBB12_13"))
		JMP(LabelRef("LBB12_12"))
	}

	// Alternative result path: X4 = X7*X4 + X9, stored to x[RAX].
	Label("LBB12_9")
	{
		VFMADD213SS(X9, X7, X4)
		VMOVSS(X4, Mem{Base: RDI}.Idx(RAX, 4))
		CMPB(R8B, R9B)
		JE(LabelRef("LBB12_13"))
	}

	// Store the just-computed value again with its sign bit flipped.
	Label("LBB12_12")
	{
		VXORPS(X0, X4, X3)
		VMOVSS(X3, Mem{Base: RDI}.Idx(RAX, 4))
		JMP(LabelRef("LBB12_13"))
	}

	// Epilogue: balance the PUSHQ from the prologue and return.
	Label("LBB12_14")
	{
		POPQ(RAX)
		VZEROUPPER()
		RET()
	}
}

// genCos_F32 emits the read-only constant table "dataCosF32" and the avo
// instruction stream for Cos_AVX2_F32 (Go signature func(x []float32)), which
// overwrites x in place -- presumably with cos(x[i]); TODO confirm against
// the caller.
//
// The emitted routine has two phases:
//   - a vector loop (LBB13_2) over the first len(x)&^7 elements, 8 floats per
//     iteration;
//   - a scalar loop (LBB13_5 .. LBB13_13) over the remaining elements, one
//     float per iteration.
//
// In the scalar loop an element whose magnitude compares above the constant
// at offset 60 (0x4b7fffff) is skipped and left unmodified.
func genCos_F32() {

	data := GLOBL("dataCosF32", RODATA|NOPTR)
	// 32-bit constants; broadcast for the vector loop and loaded as scalars
	// for the remainder loop.
	DATA(0, U32(2147483647))
	DATA(4, U32(0x3fa2f983))
	DATA(8, U32(4294967294))
	DATA(12, U32(2))
	DATA(16, U32(0xbf490fdb))
	DATA(20, U32(3221225472))
	DATA(24, U32(0x37ccf5ce))
	DATA(28, U32(0xbab6061a))
	DATA(32, U32(0x3d2aaaa5))
	DATA(36, U32(0xbf000000))
	DATA(40, U32(0x3f800000))
	DATA(44, U32(0xb94ca1f9))
	DATA(48, U32(0x3c08839e))
	DATA(52, U32(0xbe2aaaa3))
	DATA(56, U32(2147483648))
	DATA(60, U32(0x4b7fffff))
	// 32 bytes of all-ones at offset 64 (operand of VPSUBD in the vector loop).
	DATA(64, U64(0xffffffffffffffff))
	DATA(72, U64(0xffffffffffffffff))
	DATA(80, U64(0xffffffffffffffff))
	DATA(88, U64(0xffffffffffffffff))
	// 32 bytes of zero at offset 96 (operand of VPCMPEQD in the vector loop).
	DATA(96, U64(0x0))
	DATA(104, U64(0x0))
	DATA(112, U64(0x0))
	DATA(120, U64(0x0))

	TEXT("Cos_AVX2_F32", NOSPLIT, "func(x []float32)")
	Pragma("noescape")
	// RDI = &x[0], RSI = len(x).
	Load(Param("x").Base(), RDI)
	Load(Param("x").Len(), RSI)

	// Reserve scratch space; the matching ADDQ is in LBB13_14.
	// NOTE(review): several spill slots below use negative displacements
	// (-32 .. -128), i.e. memory below the adjusted RSP -- confirm that
	// region is safe to use.
	SUBQ(Imm(72), RSP)
	// RAX = len(x) rounded down to a multiple of 8 (vector-loop bound); skip
	// the vector loop entirely when that is zero.
	MOVQ(RSI, RAX)
	ANDQ(I32(-8), RAX)
	JE(LabelRef("LBB13_3"))
	// RCX is the element index inside the vector loop.
	XORL(ECX, ECX)
	// Broadcast the constants: six go to stack slots, the rest stay in
	// registers for the duration of the vector loop.
	VBROADCASTSS(data.Offset(0), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
	VBROADCASTSS(data.Offset(4), Y0)
	VMOVUPS(Y0, Mem{Base: RSP})
	VBROADCASTSS(data.Offset(8), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
	VPBROADCASTD(data.Offset(12), Y4)
	VBROADCASTSS(data.Offset(16), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
	VBROADCASTSS(data.Offset(20), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
	VBROADCASTSS(data.Offset(24), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
	VBROADCASTSS(data.Offset(28), Y9)
	VBROADCASTSS(data.Offset(32), Y10)
	VBROADCASTSS(data.Offset(36), Y6)
	VBROADCASTSS(data.Offset(40), Y12)
	VBROADCASTSS(data.Offset(44), Y13)
	VBROADCASTSS(data.Offset(48), Y14)
	VBROADCASTSS(data.Offset(52), Y15)
	VPBROADCASTD(data.Offset(56), Y2)

	// Vector loop: 8 floats per iteration while RCX < RAX.
	Label("LBB13_2")
	{
		// Y5 = x with the top bit of every lane cleared (mask 0x7fffffff).
		VMOVUPS(Mem{Base: RSP}.Offset(32), Y0)
		VANDPS(Mem{Base: RDI}.Idx(RCX, 4), Y0, Y5)
		VMULPS(Mem{Base: RSP}, Y5, Y0)
		VCVTTPS2DQ(Y0, Y0)
		// Subtracting the all-ones table adds 1 to every integer lane; the
		// following AND with 0xfffffffe then clears bit 0.
		VPSUBD(data.Offset(64), Y0, Y0)
		VPAND(Mem{Base: RSP}.Offset(-32), Y0, Y1)
		VCVTDQ2PS(Y1, Y3)
		VFMADD132PS(Mem{Base: RSP}.Offset(-64), Y5, Y3)
		VMULPS(Y3, Y3, Y5)
		// First coefficient chain (constants at offsets 24 .. 40) -> Y11.
		VMOVUPS(Mem{Base: RSP}.Offset(-128), Y8)
		VFMADD213PS(Y9, Y5, Y8)
		VFMADD213PS(Y10, Y5, Y8)
		VMULPS(Y5, Y5, Y7)
		VMOVAPS(Y6, Y11)
		VFMADD213PS(Y12, Y5, Y11)
		VFMADD231PS(Y7, Y8, Y11)
		// Second coefficient chain (constants at offsets 44, 48, 52) -> Y5.
		VMOVAPS(Y13, Y7)
		VFMADD213PS(Y14, Y5, Y7)
		VFMADD213PS(Y15, Y5, Y7)
		VMULPS(Y3, Y5, Y5)
		VFMADD213PS(Y3, Y7, Y5)
		// Build two complementary lane masks from (Y0 & 2), use them to pick
		// between Y5 and Y11, and take (Y5 + Y11) minus that pick as the result.
		VPAND(Y4, Y0, Y0)
		VPCMPEQD(Y4, Y0, Y3)
		VPCMPEQD(data.Offset(96), Y0, Y0)
		VANDPS(Y0, Y5, Y0)
		VANDPS(Y3, Y11, Y3)
		VADDPS(Y3, Y0, Y0)
		VADDPS(Y5, Y11, Y3)
		VSUBPS(Y0, Y3, Y0)
		// Derive the sign bit from the integer lanes and apply it (Y2 is the
		// 0x80000000 mask).
		VPSLLD(Imm(29), Y1, Y1)
		VPADDD(Mem{Base: RSP}.Offset(-96), Y1, Y1)
		VPAND(Y2, Y1, Y1)
		VPXOR(Y2, Y1, Y1)
		VXORPS(Y1, Y0, Y0)
		VMOVUPS(Y0, Mem{Base: RDI}.Idx(RCX, 4))
		ADDQ(Imm(8), RCX)
		CMPQ(RCX, RAX)
		JB(LabelRef("LBB13_2"))
	}

	// Scalar remainder setup: nothing to do when RAX (the rounded-down
	// length) already reaches len(x). Otherwise load the constants as scalars.
	Label("LBB13_3")
	{
		CMPQ(RAX, RSI)
		JAE(LabelRef("LBB13_14"))
		// X0 = 0x80000000 mask, X1 = 0, X2 = constant at offset 60.
		VBROADCASTSS(data.Offset(56), X0)
		VXORPS(X1, X1, X1)
		VMOVSS(data.Offset(60), X2)
		VMOVSS(data.Offset(40), X9)
		VMOVSS(data.Offset(16), X10)
		VMOVSS(data.Offset(24), X8)
		VMOVSS(data.Offset(28), X11)
		VMOVSS(data.Offset(32), X13)
		VMOVSS(data.Offset(36), X14)
		VMOVSS(data.Offset(44), X7)
		VMOVSS(data.Offset(48), X15)
		VMOVSS(data.Offset(52), X6)
		JMP(LabelRef("LBB13_5"))
	}

	// Advance to the next scalar element; leave once RAX reaches len(x).
	Label("LBB13_13")
	{
		ADDQ(Imm(1), RAX)
		CMPQ(RAX, RSI)
		JAE(LabelRef("LBB13_14"))
	}

	// Scalar loop head: X3 = x[RAX], then replaced by x with its sign bit
	// flipped when x < 0.
	Label("LBB13_5")
	{
		VMOVSS(Mem{Base: RDI}.Idx(RAX, 4), X3)
		VXORPS(X0, X3, X4)
		VCMPSS(Imm(1), X1, X3, X5)
		VBLENDVPS(X5, X4, X3, X3)
		// Skip this element (leaving it untouched) when X3 compares above X2.
		VUCOMISS(X2, X3)
		JA(LabelRef("LBB13_13"))
		VMULSS(data.Offset(4), X3, X4)
		VCVTTSS2SI(X4, EDX)
		VROUNDSS(Imm(11), X4, X4, X4)
		// If the truncated integer is odd, add the constant in X9 to X4.
		MOVL(EDX, ECX)
		ANDL(Imm(1), ECX)
		JE(LabelRef("LBB13_8"))
		VADDSS(X4, X9, X4)
	}

	Label("LBB13_8")
	{
		// ECX = (integer + its low bit) & 7; EDX = ECX-4, replaced by ECX
		// when the CMOVLLT condition holds. R8B and CL record the SETCC
		// results of the compares against 4 and 2 respectively.
		ADDL(EDX, ECX)
		ANDL(Imm(7), ECX)
		LEAL(Mem{Base: RCX}.Offset(-4), EDX)
		CMPL(ECX, Imm(4))
		CMOVLLT(ECX, EDX)
		SETCC(R8B)
		CMPL(EDX, Imm(2))
		SETCC(CL)
		VFMADD231SS(X10, X4, X3)
		VMULSS(X3, X3, X4)
		// Both coefficient chains are evaluated up front: X12 from X8, X11,
		// X13, X14 and X5 from X7, X15, X6.
		VMOVAPS(X8, X12)
		VFMADD213SS(X11, X4, X12)
		VFMADD213SS(X13, X4, X12)
		VFMADD213SS(X14, X4, X12)
		VMOVAPS(X7, X5)
		VFMADD213SS(X15, X4, X5)
		VFMADD213SS(X6, X4, X5)
		// EDX-1 below 2 (unsigned) selects the X5-based result in LBB13_9;
		// otherwise the X12-based result is finished and stored here.
		ADDL(I32(-1), EDX)
		CMPL(EDX, Imm(2))
		JB(LabelRef("LBB13_9"))
		VFMADD213SS(X9, X12, X4)
		VMOVAPS(X4, X5)
		VMOVSS(X5, Mem{Base: RDI}.Idx(RAX, 4))
		// When the two recorded flags differ, LBB13_12 rewrites the element
		// with its sign flipped.
		CMPB(R8B, CL)
		JE(LabelRef("LBB13_13"))
		JMP(LabelRef("LBB13_12"))
	}

	// Alternative result path: X5 = (X3*X4)*X5 + X3, stored to x[RAX].
	Label("LBB13_9")
	{
		VMULSS(X3, X4, X4)
		VFMADD213SS(X3, X4, X5)
		VMOVSS(X5, Mem{Base: RDI}.Idx(RAX, 4))
		CMPB(R8B, CL)
		JE(LabelRef("LBB13_13"))
	}

	// Store the just-computed value again with its sign bit flipped.
	Label("LBB13_12")
	{
		VXORPS(X0, X5, X3)
		VMOVSS(X3, Mem{Base: RDI}.Idx(RAX, 4))
		JMP(LabelRef("LBB13_13"))
	}

	// Epilogue: undo the stack adjustment made in the prologue and return.
	Label("LBB13_14")
	{
		ADDQ(Imm(72), RSP)
		VZEROUPPER()
		RET()
	}
}

// genSinCos_F32 emits the read-only constant table dataSinCosF32 and the
// assembly routine SinCos_AVX2_F32, declared as func(x, y, z []float32).
//
// The emitted routine reads z and writes x and y: for every index below
// len(x) it evaluates two polynomials of the range-reduced input and stores
// one in x[i] and the other in y[i], with signs chosen from the octant index.
// The odd polynomial (r + r*r^2*P(r^2)) lands in x when (j & 2) == 0, which
// matches sine; the even one (1 - r^2/2 + r^4*Q(r^2)) then lands in y, which
// matches cosine.
//
// Elements are processed eight at a time in a YMM loop (LBB14_2) and the
// remaining len(x) % 8 elements one at a time in a scalar loop (LBB14_5).
//
// NOTE(review): only len(x) is loaded; the lengths of y and z are never read,
// so callers presumably must guarantee len(y) >= len(x) and len(z) >= len(x).
//
// Operand order follows Go assembler convention (sources first, destination
// last), so e.g. VFMADD213PS(c, b, a) computes a = b*a + c.
func genSinCos_F32() {

	// Constant table layout (byte offset: value):
	//    0: 0x7fffffff  mask clearing the float32 sign bit
	//    4: 0x3fa2f983  float32 ~1.2732395 (4/pi)
	//    8: 0xfffffffe  integer mask clearing bit 0
	//   12: 2           integer, tests bit 1 of the octant index
	//   16: 0xbf490fdb  float32 ~-0.7853982 (-pi/4)
	//   20: 0xc0000000  integer added to the shifted octant index (y sign)
	//   24: 0x80000000  float32 sign bit
	//   28, 32, 36:     coefficients of the even polynomial
	//                   (~2.4433e-5, ~-1.38873e-3, ~4.16666e-2)
	//   40: 0xbf000000  float32 -0.5
	//   44: 0x3f800000  float32 1.0
	//   48, 52, 56:     coefficients of the odd polynomial
	//                   (~-1.9515e-4, ~8.3322e-3, ~-1.66667e-1)
	//   60: 0x4b7fffff  float32 16777215.0, magnitude limit for the scalar tail
	//   64..95:         32 bytes of all-ones (int32 -1 in every lane)
	//   96..127:        32 bytes of zero
	data := GLOBL("dataSinCosF32", RODATA|NOPTR)
	DATA(0, U32(2147483647))
	DATA(4, U32(0x3fa2f983))
	DATA(8, U32(4294967294))
	DATA(12, U32(2))
	DATA(16, U32(0xbf490fdb))
	DATA(20, U32(3221225472))
	DATA(24, U32(2147483648))
	DATA(28, U32(0x37ccf5ce))
	DATA(32, U32(0xbab6061a))
	DATA(36, U32(0x3d2aaaa5))
	DATA(40, U32(0xbf000000))
	DATA(44, U32(0x3f800000))
	DATA(48, U32(0xb94ca1f9))
	DATA(52, U32(0x3c08839e))
	DATA(56, U32(0xbe2aaaa3))
	DATA(60, U32(0x4b7fffff))
	DATA(64, U64(0xffffffffffffffff))
	DATA(72, U64(0xffffffffffffffff))
	DATA(80, U64(0xffffffffffffffff))
	DATA(88, U64(0xffffffffffffffff))
	DATA(96, U64(0x0))
	DATA(104, U64(0x0))
	DATA(112, U64(0x0))
	DATA(120, U64(0x0))

	// Register roles for the whole routine:
	//   RDI = &x[0] (first output), RSI = &y[0] (second output),
	//   RDX = &z[0] (input), RCX = len(x).
	TEXT("SinCos_AVX2_F32", 0, "func(x, y, z []float32)")
	Pragma("noescape")
	Load(Param("x").Base(), RDI)
	Load(Param("y").Base(), RSI)
	Load(Param("z").Base(), RDX)
	Load(Param("x").Len(), RCX)

	// RBX is used as scratch in the scalar tail, so it is saved here and
	// restored in LBB14_16. 96 bytes of stack are reserved for spilled
	// broadcast constants.
	PUSHQ(RBX)
	SUBQ(Imm(96), RSP)
	// R8 = len(x) rounded down to a multiple of 8: the element count covered
	// by the vector loop. ANDQ sets ZF, so a zero count skips straight to the
	// scalar tail.
	MOVQ(RCX, R8)
	ANDQ(I32(-8), R8)
	JE(LabelRef("LBB14_3"))
	// RAX is the element index of the vector loop.
	XORL(EAX, EAX)
	// Broadcast the constants to all eight lanes. Seven of them are spilled to
	// the stack via Y0; the rest stay resident in Y2, Y4, Y8, Y10, Y11 and
	// Y13-Y15:
	//   RSP+64:  abs mask        RSP+32:  4/pi       RSP+0:   0xfffffffe
	//   RSP-32:  -pi/4           RSP-64:  0xc0000000
	//   RSP-96:  even coeff #0   RSP-128: even coeff #1
	// NOTE(review): the four slots at negative offsets lie below the adjusted
	// RSP (outside the 96 bytes reserved above) — confirm this is safe in the
	// environments this routine runs in.
	VBROADCASTSS(data.Offset(0), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(64))
	VBROADCASTSS(data.Offset(4), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(32))
	VBROADCASTSS(data.Offset(8), Y0)
	VMOVUPS(Y0, Mem{Base: RSP})
	// Y4 = integer 2 in every lane.
	VPBROADCASTD(data.Offset(12), Y4)
	VBROADCASTSS(data.Offset(16), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-32))
	VBROADCASTSS(data.Offset(20), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-64))
	// Y8 = sign-bit mask in every lane.
	VPBROADCASTD(data.Offset(24), Y8)
	VBROADCASTSS(data.Offset(28), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-96))
	VBROADCASTSS(data.Offset(32), Y0)
	VMOVUPS(Y0, Mem{Base: RSP}.Offset(-128))
	// Y11 = even coeff #2, Y10 = -0.5, Y13 = 1.0,
	// Y14/Y15/Y2 = odd coeffs #0/#1/#2.
	VBROADCASTSS(data.Offset(36), Y11)
	VBROADCASTSS(data.Offset(40), Y10)
	VBROADCASTSS(data.Offset(44), Y13)
	VBROADCASTSS(data.Offset(48), Y14)
	VBROADCASTSS(data.Offset(52), Y15)
	VBROADCASTSS(data.Offset(56), Y2)

	// Vector loop: eight elements per iteration, index in RAX, bound in R8.
	Label("LBB14_2")
	{
		// Y5 = z[RAX..RAX+7]; Y1 = |z|.
		VMOVUPS(Mem{Base: RDX}.Idx(RAX, 4), Y5)
		VANDPS(Mem{Base: RSP}.Offset(64), Y5, Y1)
		// Y0 = trunc(|z| * 4/pi) as int32.
		VMULPS(Mem{Base: RSP}.Offset(32), Y1, Y0)
		VCVTTPS2DQ(Y0, Y0)
		// Y3 = Y0 + 1 (subtracting the all-ones constant, i.e. -1).
		VPSUBD(data.Offset(64), Y0, Y3)
		// Y0 = Y3 with bit 0 cleared: the even octant index j.
		VPAND(Mem{Base: RSP}, Y3, Y0)
		// Y6 = r = float(j) * (-pi/4) + |z|, the reduced argument.
		VCVTDQ2PS(Y0, Y6)
		VFMADD132PS(Mem{Base: RSP}.Offset(-32), Y1, Y6)
		// Y1 = r^2.
		VMULPS(Y6, Y6, Y1)
		// Y9 = Horner evaluation of the even-polynomial coefficients in r^2.
		VMOVUPS(Mem{Base: RSP}.Offset(-96), Y9)
		VFMADD213PS(Mem{Base: RSP}.Offset(-128), Y1, Y9)
		VFMADD213PS(Y11, Y1, Y9)
		// Y7 = r^4.
		VMULPS(Y1, Y1, Y7)
		// Y12 = 1 - 0.5*r^2 + r^4*Y9: the even polynomial.
		VMOVAPS(Y10, Y12)
		VFMADD213PS(Y13, Y1, Y12)
		VFMADD231PS(Y7, Y9, Y12)
		// Y7 = Horner evaluation of the odd-polynomial coefficients in r^2.
		VMOVAPS(Y14, Y7)
		VFMADD213PS(Y15, Y1, Y7)
		VFMADD213PS(Y2, Y1, Y7)
		// Y1 = r + (r*r^2)*Y7: the odd polynomial.
		VMULPS(Y6, Y1, Y1)
		VFMADD213PS(Y6, Y7, Y1)
		// Y6 = (j+bit0) << 29, which moves bit 2 into the sign position.
		VPSLLD(Imm(29), Y3, Y6)
		// Y3 = bit 1 of the octant index (0 or 2).
		VPAND(Y4, Y3, Y3)
		// Y5 = input bits XOR shifted octant bits; only its sign bit is used
		// below, as the sign to apply to the x output.
		VPXOR(Y5, Y6, Y5)
		// Y6 = lane mask where bit 1 is set; Y3 = lane mask where it is clear.
		VPCMPEQD(Y4, Y3, Y6)
		VPCMPEQD(data.Offset(96), Y3, Y3)
		// Y3 = odd polynomial where bit 1 is clear, even polynomial where it
		// is set (the two masked halves are disjoint, so the add merges them).
		VANDPS(Y3, Y1, Y3)
		VANDPS(Y6, Y12, Y6)
		VADDPS(Y3, Y6, Y3)
		// Y1 = (odd + even) - selected: recovers the polynomial not chosen
		// for x, which becomes the magnitude of the y output.
		VADDPS(Y1, Y12, Y1)
		VPAND(Y5, Y8, Y5)
		VSUBPS(Y3, Y1, Y1)
		// Apply the sign bit isolated in Y5 to the x output.
		VPXOR(Y3, Y5, Y3)
		// Sign of the y output: shift j left by 29, add 0xc0000000, keep only
		// the sign bit, invert it, and XOR it into Y1.
		VPSLLD(Imm(29), Y0, Y0)
		VPADDD(Mem{Base: RSP}.Offset(-64), Y0, Y0)
		VPAND(Y0, Y8, Y0)
		VPXOR(Y0, Y8, Y0)
		VXORPS(Y0, Y1, Y0)
		// Store eight results to x and y, then advance.
		VMOVDQU(Y3, Mem{Base: RDI}.Idx(RAX, 4))
		VMOVUPS(Y0, Mem{Base: RSI}.Idx(RAX, 4))
		ADDQ(Imm(8), RAX)
		CMPQ(RAX, R8)
		JB(LabelRef("LBB14_2"))
	}

	// Scalar tail setup. R8 already equals the first index not covered by the
	// vector loop and is reused as the scalar index.
	Label("LBB14_3")
	{
		// Nothing left when len(x) is a multiple of 8.
		CMPQ(R8, RCX)
		JAE(LabelRef("LBB14_16"))
		// Scalar constants: X0 = sign-bit mask, X1 = 0.0, X2 = 16777215.0,
		// X6 = 1.0, X8 = even coeff #0, X12 = even coeff #2, X13 = -0.5,
		// X15/X14/X10 = odd coeffs #0/#1/#2. The remaining constants
		// (4/pi, -pi/4, even coeff #1) are read from memory inside the loop.
		VBROADCASTSS(data.Offset(24), X0)
		VXORPS(X1, X1, X1)
		VMOVSS(data.Offset(60), X2)
		VMOVSS(data.Offset(44), X6)
		VMOVSS(data.Offset(28), X8)
		VMOVSS(data.Offset(36), X12)
		VMOVSS(data.Offset(40), X13)
		VMOVSS(data.Offset(48), X15)
		VMOVSS(data.Offset(52), X14)
		VMOVSS(data.Offset(56), X10)
		JMP(LabelRef("LBB14_5"))
	}

	// Loop increment for the scalar tail; every path through the body jumps
	// back here.
	Label("LBB14_15")
	{
		ADDQ(Imm(1), R8)
		CMPQ(R8, RCX)
		JAE(LabelRef("LBB14_16"))
	}

	// Scalar body, part 1: load, take the absolute value, compute the octant.
	Label("LBB14_5")
	{
		// X4 = z[R8]; X5 = |z[R8]| (the negated copy is blended in where the
		// input compares less than zero).
		VMOVSS(Mem{Base: RDX}.Idx(R8, 4), X4)
		VXORPS(X0, X4, X5)
		VCMPSS(Imm(1), X1, X4, X7)
		VBLENDVPS(X7, X5, X4, X5)
		// Inputs whose magnitude exceeds 16777215.0 are skipped: x[R8] and
		// y[R8] are left unwritten for them.
		VUCOMISS(X2, X5)
		JA(LabelRef("LBB14_15"))
		// R9B = carry flag from comparing the input against 0.0 (set when the
		// input is below zero).
		VUCOMISS(X1, X4)
		SETCS(R9B)
		// X4 = |z| * 4/pi; R10L = its truncated integer value; X4 is then
		// truncated in place (rounding immediate 11 = toward zero, precision
		// exception suppressed).
		VMULSS(data.Offset(4), X5, X4)
		VCVTTSS2SI(X4, R10L)
		VROUNDSS(Imm(11), X4, X4, X4)
		// EAX = low bit of the integer. When it is odd, fall through and add
		// 1.0 to the float copy so both forms are rounded up to even.
		MOVL(R10L, EAX)
		ANDL(Imm(1), EAX)
		JE(LabelRef("LBB14_8"))
		VADDSS(X6, X4, X4)
	}

	// Scalar body, part 2: octant bookkeeping and polynomial evaluation.
	Label("LBB14_8")
	{
		// EAX = (integer + its low bit) & 7: the even octant index in 0..7.
		ADDL(R10L, EAX)
		ANDL(Imm(7), EAX)
		// R10L = EAX - 4 when EAX >= 4, otherwise EAX.
		// R11B = 1 when EAX >= 4 (the octant demands a sign flip on x).
		LEAL(Mem{Base: RAX}.Offset(-4), R10L)
		CMPL(EAX, Imm(4))
		SETCC(R11B)
		CMOVLLT(EAX, R10L)
		// X5 = r = X4 * (-pi/4) + |z|, the reduced argument; X7 = r^2.
		VFMADD231SS(data.Offset(16), X4, X5)
		VMULSS(X5, X5, X7)
		// X11 = Horner evaluation of the even-polynomial coefficients in r^2.
		VMOVAPS(X8, X11)
		VFMADD213SS(data.Offset(32), X7, X11)
		VFMADD213SS(X12, X7, X11)
		// X9 = r^4; X4 = 1 - 0.5*r^2 + r^4*X11: the even polynomial.
		VMULSS(X7, X7, X9)
		VMOVAPS(X6, X4)
		VFMADD231SS(X13, X7, X4)
		VFMADD231SS(X9, X11, X4)
		// X3 = Horner evaluation of the odd-polynomial coefficients in r^2.
		VMOVAPS(X15, X3)
		VFMADD213SS(X14, X7, X3)
		VFMADD213SS(X10, X7, X3)
		// X7 = r + (r*r^2)*X3: the odd polynomial.
		VMULSS(X5, X7, X7)
		VFMADD213SS(X5, X3, X7)
		// When the reduced octant (R10L) is 1 or 2 the two polynomials swap
		// roles; that case is handled in LBB14_9.
		LEAL(Mem{Base: R10}.Offset(-1), EBX)
		CMPL(EBX, Imm(2))
		JB(LabelRef("LBB14_9"))
		// No swap: odd polynomial goes to x, even polynomial to y.
		VMOVAPS(X7, X5)
		VMOVSS(X5, Mem{Base: RDI}.Idx(R8, 4))
		VMOVSS(X4, Mem{Base: RSI}.Idx(R8, 4))
		// x is negated when exactly one of "input below zero" (R9B) and
		// "octant >= 4" (R11B) holds.
		CMPB(R9B, R11B)
		JNE(LabelRef("LBB14_12"))
		JMP(LabelRef("LBB14_13"))
	}

	// Swapped case: even polynomial goes to x, odd polynomial to y.
	Label("LBB14_9")
	{
		VMOVAPS(X4, X5)
		VMOVAPS(X7, X4)
		VMOVSS(X5, Mem{Base: RDI}.Idx(R8, 4))
		VMOVSS(X4, Mem{Base: RSI}.Idx(R8, 4))
		// Same sign rule as above; fall through into LBB14_12 to negate x.
		CMPB(R9B, R11B)
		JE(LabelRef("LBB14_13"))
	}

	// Negate the value just stored in x[R8] by flipping its sign bit.
	Label("LBB14_12")
	{
		VMOVSS(Mem{Base: RDI}.Idx(R8, 4), X3)
		VXORPS(X0, X3, X3)
		VMOVSS(X3, Mem{Base: RDI}.Idx(R8, 4))
	}

	// Sign of the y output: negate y[R8] when "reduced octant >= 2" (BL) and
	// "octant >= 4" (AL) disagree.
	Label("LBB14_13")
	{
		CMPL(R10L, Imm(2))
		SETCC(BL)
		CMPL(EAX, Imm(4))
		SETCC(AL)
		CMPB(AL, BL)
		JE(LabelRef("LBB14_15"))
		VMOVSS(Mem{Base: RSI}.Idx(R8, 4), X3)
		VXORPS(X0, X3, X3)
		VMOVSS(X3, Mem{Base: RSI}.Idx(R8, 4))
		JMP(LabelRef("LBB14_15"))
	}

	// Epilogue: release the spill area, restore RBX, clear the upper YMM
	// halves, and return.
	Label("LBB14_16")
	{
		ADDQ(Imm(96), RSP)
		POPQ(RBX)
		VZEROUPPER()
		RET()
	}
}
